• DOMAIN: Automobile
• CONTEXT: The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes
• DATA DESCRIPTION: The data concerns city-cycle fuel consumption in miles per gallon
• Attribute Information:
• PROJECT OBJECTIVE: Goal is to cluster the data and treat each cluster as an individual dataset to train Regression models to predict 'mpg'. Steps and tasks: [Total Score: 25 points]
import numpy as np
import pandas as pd
import json
from scipy.stats import zscore
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors
from random import sample
from numpy.random import uniform
from math import isnan
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from numpy import arange
from sklearn.linear_model import LinearRegression
# Read the car-name CSV into a dataframe and report its dimensions
df_name = pd.read_csv("Part1 - Car name.csv")
print("Shape of Car Name csv file :", df_name.shape)
print("Size of Car Name csv file :", df_name.size)
print()

# Parse the car-attributes JSON into a dataframe and report its dimensions
with open('Part1 - Car-Attributes.json', 'r') as fh:
    df_attr = pd.DataFrame(json.load(fh))
print("Shape of Car attribute json file :", df_attr.shape)
print("Size of Car attribute json file :", df_attr.size)
Shape of Car Name csv file : (398, 1) Size of Car Name csv file : 398 Shape of Car attribute json file : (398, 8) Size of Car attribute json file : 3184
# Build the master dataset: car names (CSV) and attributes (JSON) side by side
names = pd.read_csv("Part1 - Car name.csv")
with open('Part1 - Car-Attributes.json', 'r') as fh:
    attrs = pd.DataFrame(json.load(fh))
df_master = pd.concat([names, attrs], axis=1)
# Report the combined dimensions
print("Shape of Dataset: ", df_master.shape)
print("Size of Dataset: ", df_master.size)
Shape of Dataset: (398, 9) Size of Dataset: 3582
# Persist the master dataframe in three formats...
df_master.to_csv('output.csv', index=False)
df_master.to_excel("output.xlsx", index=False)
df_master.to_json('output.json')

# ...then read each one back to verify the round trip
df_csv = pd.read_csv('output.csv')
df_xl = pd.read_excel("output.xlsx")
df_json = pd.read_json('output.json')

# Show the CSV round-trip copy
df_csv
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | ford mustang gl | 27.0 | 4 | 140.0 | 86 | 2790 | 15.6 | 82 | 1 |
| 394 | vw pickup | 44.0 | 4 | 97.0 | 52 | 2130 | 24.6 | 82 | 2 |
| 395 | dodge rampage | 32.0 | 4 | 135.0 | 84 | 2295 | 11.6 | 82 | 1 |
| 396 | ford ranger | 28.0 | 4 | 120.0 | 79 | 2625 | 18.6 | 82 | 1 |
| 397 | chevy s-10 | 31.0 | 4 | 119.0 | 82 | 2720 | 19.4 | 82 | 1 |
398 rows × 9 columns
# Show the dataframe round-tripped through Excel
df_xl
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | ford mustang gl | 27.0 | 4 | 140.0 | 86 | 2790 | 15.6 | 82 | 1 |
| 394 | vw pickup | 44.0 | 4 | 97.0 | 52 | 2130 | 24.6 | 82 | 2 |
| 395 | dodge rampage | 32.0 | 4 | 135.0 | 84 | 2295 | 11.6 | 82 | 1 |
| 396 | ford ranger | 28.0 | 4 | 120.0 | 79 | 2625 | 18.6 | 82 | 1 |
| 397 | chevy s-10 | 31.0 | 4 | 119.0 | 82 | 2720 | 19.4 | 82 | 1 |
398 rows × 9 columns
# Show the dataframe round-tripped through JSON
df_json
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | ford mustang gl | 27.0 | 4 | 140.0 | 86 | 2790 | 15.6 | 82 | 1 |
| 394 | vw pickup | 44.0 | 4 | 97.0 | 52 | 2130 | 24.6 | 82 | 2 |
| 395 | dodge rampage | 32.0 | 4 | 135.0 | 84 | 2295 | 11.6 | 82 | 1 |
| 396 | ford ranger | 28.0 | 4 | 120.0 | 79 | 2625 | 18.6 | 82 | 1 |
| 397 | chevy s-10 | 31.0 | 4 | 119.0 | 82 | 2720 | 19.4 | 82 | 1 |
398 rows × 9 columns
# Column dtypes and non-null counts of the master dataset.
# NOTE(review): "hp" is object dtype — it holds the "?" placeholder for
# missing horsepower values (handled by imputation below).
df_master.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 car_name 398 non-null object 1 mpg 398 non-null float64 2 cyl 398 non-null int64 3 disp 398 non-null float64 4 hp 398 non-null object 5 wt 398 non-null int64 6 acc 398 non-null float64 7 yr 398 non-null int64 8 origin 398 non-null int64 dtypes: float64(3), int64(4), object(2) memory usage: 28.1+ KB
# Print every car name in the dataset, one per line
print("\n".join(df_master["car_name"]))
chevrolet chevelle malibu buick skylark 320 plymouth satellite amc rebel sst ford torino ford galaxie 500 chevrolet impala plymouth fury iii pontiac catalina amc ambassador dpl dodge challenger se plymouth 'cuda 340 chevrolet monte carlo buick estate wagon (sw) toyota corona mark ii plymouth duster amc hornet ford maverick datsun pl510 volkswagen 1131 deluxe sedan peugeot 504 audi 100 ls saab 99e bmw 2002 amc gremlin ford f250 chevy c20 dodge d200 hi 1200d datsun pl510 chevrolet vega 2300 toyota corona ford pinto amc gremlin plymouth satellite custom chevrolet chevelle malibu ford torino 500 amc matador chevrolet impala pontiac catalina brougham ford galaxie 500 plymouth fury iii dodge monaco (sw) ford country squire (sw) pontiac safari (sw) amc hornet sportabout (sw) chevrolet vega (sw) pontiac firebird ford mustang mercury capri 2000 opel 1900 peugeot 304 fiat 124b toyota corolla 1200 datsun 1200 volkswagen model 111 plymouth cricket toyota corona hardtop dodge colt hardtop volkswagen type 3 chevrolet vega ford pinto runabout chevrolet impala pontiac catalina plymouth fury iii ford galaxie 500 amc ambassador sst mercury marquis buick lesabre custom oldsmobile delta 88 royale chrysler newport royal mazda rx2 coupe amc matador (sw) chevrolet chevelle concours (sw) ford gran torino (sw) plymouth satellite custom (sw) volvo 145e (sw) volkswagen 411 (sw) peugeot 504 (sw) renault 12 (sw) ford pinto (sw) datsun 510 (sw) toyouta corona mark ii (sw) dodge colt (sw) toyota corolla 1600 (sw) buick century 350 amc matador chevrolet malibu ford gran torino dodge coronet custom mercury marquis brougham chevrolet caprice classic ford ltd plymouth fury gran sedan chrysler new yorker brougham buick electra 225 custom amc ambassador brougham plymouth valiant chevrolet nova custom amc hornet ford maverick plymouth duster volkswagen super beetle chevrolet impala ford country plymouth custom suburb oldsmobile vista cruiser amc gremlin toyota carina chevrolet vega datsun 610 maxda rx3 
ford pinto mercury capri v6 fiat 124 sport coupe chevrolet monte carlo s pontiac grand prix fiat 128 opel manta audi 100ls volvo 144ea dodge dart custom saab 99le toyota mark ii oldsmobile omega plymouth duster ford maverick amc hornet chevrolet nova datsun b210 ford pinto toyota corolla 1200 chevrolet vega chevrolet chevelle malibu classic amc matador plymouth satellite sebring ford gran torino buick century luxus (sw) dodge coronet custom (sw) ford gran torino (sw) amc matador (sw) audi fox volkswagen dasher opel manta toyota corona datsun 710 dodge colt fiat 128 fiat 124 tc honda civic subaru fiat x1.9 plymouth valiant custom chevrolet nova mercury monarch ford maverick pontiac catalina chevrolet bel air plymouth grand fury ford ltd buick century chevroelt chevelle malibu amc matador plymouth fury buick skyhawk chevrolet monza 2+2 ford mustang ii toyota corolla ford pinto amc gremlin pontiac astro toyota corona volkswagen dasher datsun 710 ford pinto volkswagen rabbit amc pacer audi 100ls peugeot 504 volvo 244dl saab 99le honda civic cvcc fiat 131 opel 1900 capri ii dodge colt renault 12tl chevrolet chevelle malibu classic dodge coronet brougham amc matador ford gran torino plymouth valiant chevrolet nova ford maverick amc hornet chevrolet chevette chevrolet woody vw rabbit honda civic dodge aspen se ford granada ghia pontiac ventura sj amc pacer d/l volkswagen rabbit datsun b-210 toyota corolla ford pinto volvo 245 plymouth volare premier v8 peugeot 504 toyota mark ii mercedes-benz 280s cadillac seville chevy c10 ford f108 dodge d100 honda accord cvcc buick opel isuzu deluxe renault 5 gtl plymouth arrow gs datsun f-10 hatchback chevrolet caprice classic oldsmobile cutlass supreme dodge monaco brougham mercury cougar brougham chevrolet concours buick skylark plymouth volare custom ford granada pontiac grand prix lj chevrolet monte carlo landau chrysler cordoba ford thunderbird volkswagen rabbit custom pontiac sunbird coupe toyota corolla liftback ford mustang ii 
2+2 chevrolet chevette dodge colt m/m subaru dl volkswagen dasher datsun 810 bmw 320i mazda rx-4 volkswagen rabbit custom diesel ford fiesta mazda glc deluxe datsun b210 gx honda civic cvcc oldsmobile cutlass salon brougham dodge diplomat mercury monarch ghia pontiac phoenix lj chevrolet malibu ford fairmont (auto) ford fairmont (man) plymouth volare amc concord buick century special mercury zephyr dodge aspen amc concord d/l chevrolet monte carlo landau buick regal sport coupe (turbo) ford futura dodge magnum xe chevrolet chevette toyota corona datsun 510 dodge omni toyota celica gt liftback plymouth sapporo oldsmobile starfire sx datsun 200-sx audi 5000 volvo 264gl saab 99gle peugeot 604sl volkswagen scirocco honda accord lx pontiac lemans v6 mercury zephyr 6 ford fairmont 4 amc concord dl 6 dodge aspen 6 chevrolet caprice classic ford ltd landau mercury grand marquis dodge st. regis buick estate wagon (sw) ford country squire (sw) chevrolet malibu classic (sw) chrysler lebaron town @ country (sw) vw rabbit custom maxda glc deluxe dodge colt hatchback custom amc spirit dl mercedes benz 300d cadillac eldorado peugeot 504 oldsmobile cutlass salon brougham plymouth horizon plymouth horizon tc3 datsun 210 fiat strada custom buick skylark limited chevrolet citation oldsmobile omega brougham pontiac phoenix vw rabbit toyota corolla tercel chevrolet chevette datsun 310 chevrolet citation ford fairmont amc concord dodge aspen audi 4000 toyota corona liftback mazda 626 datsun 510 hatchback toyota corolla mazda glc dodge colt datsun 210 vw rabbit c (diesel) vw dasher (diesel) audi 5000s (diesel) mercedes-benz 240d honda civic 1500 gl renault lecar deluxe subaru dl vokswagen rabbit datsun 280-zx mazda rx-7 gs triumph tr7 coupe ford mustang cobra honda accord plymouth reliant buick skylark dodge aries wagon (sw) chevrolet citation plymouth reliant toyota starlet plymouth champ honda civic 1300 subaru datsun 210 mpg toyota tercel mazda glc 4 plymouth horizon 4 ford escort 4w 
ford escort 2h volkswagen jetta renault 18i honda prelude toyota corolla datsun 200sx mazda 626 peugeot 505s turbo diesel volvo diesel toyota cressida datsun 810 maxima buick century oldsmobile cutlass ls ford granada gl chrysler lebaron salon chevrolet cavalier chevrolet cavalier wagon chevrolet cavalier 2-door pontiac j2000 se hatchback dodge aries se pontiac phoenix ford fairmont futura amc concord dl volkswagen rabbit l mazda glc custom l mazda glc custom plymouth horizon miser mercury lynx l nissan stanza xe honda accord toyota corolla honda civic honda civic (auto) datsun 310 gx buick century limited oldsmobile cutlass ciera (diesel) chrysler lebaron medallion ford granada l toyota celica gt dodge charger 2.2 chevrolet camaro ford mustang gl vw pickup dodge rampage ford ranger chevy s-10
# For every column except car_name, show its distinct values
for feature in df_master.columns[1:]:
    print(feature, df_master[feature].unique(), sep="\n")
    print()
mpg [18. 15. 16. 17. 14. 24. 22. 21. 27. 26. 25. 10. 11. 9. 28. 19. 12. 13. 23. 30. 31. 35. 20. 29. 32. 33. 17.5 15.5 14.5 22.5 24.5 18.5 29.5 26.5 16.5 31.5 36. 25.5 33.5 20.5 30.5 21.5 43.1 36.1 32.8 39.4 19.9 19.4 20.2 19.2 25.1 20.6 20.8 18.6 18.1 17.7 27.5 27.2 30.9 21.1 23.2 23.8 23.9 20.3 21.6 16.2 19.8 22.3 17.6 18.2 16.9 31.9 34.1 35.7 27.4 25.4 34.2 34.5 31.8 37.3 28.4 28.8 26.8 41.5 38.1 32.1 37.2 26.4 24.3 19.1 34.3 29.8 31.3 37. 32.2 46.6 27.9 40.8 44.3 43.4 36.4 44.6 40.9 33.8 32.7 23.7 23.6 32.4 26.6 25.8 23.5 39.1 39. 35.1 32.3 37.7 34.7 34.4 29.9 33.7 32.9 31.6 28.1 30.7 24.2 22.4 34. 38. 44. ] cyl [8 4 6 3 5] disp [307. 350. 318. 304. 302. 429. 454. 440. 455. 390. 383. 340. 400. 113. 198. 199. 200. 97. 110. 107. 104. 121. 360. 140. 98. 232. 225. 250. 351. 258. 122. 116. 79. 88. 71. 72. 91. 97.5 70. 120. 96. 108. 155. 68. 114. 156. 76. 83. 90. 231. 262. 134. 119. 171. 115. 101. 305. 85. 130. 168. 111. 260. 151. 146. 80. 78. 105. 131. 163. 89. 267. 86. 183. 141. 173. 135. 81. 100. 145. 112. 181. 144. ] hp [130 165 150 140 198 220 215 225 190 170 160 95 97 85 88 46 87 90 113 200 210 193 '?' 
100 105 175 153 180 110 72 86 70 76 65 69 60 80 54 208 155 112 92 145 137 158 167 94 107 230 49 75 91 122 67 83 78 52 61 93 148 129 96 71 98 115 53 81 79 120 152 102 108 68 58 149 89 63 48 66 139 103 125 133 138 135 142 77 62 132 84 64 74 116 82] wt [3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 3563 3609 3761 3086 2372 2833 2774 2587 2130 1835 2672 2430 2375 2234 2648 4615 4376 4382 4732 2264 2228 2046 2634 3439 3329 3302 3288 4209 4464 4154 4096 4955 4746 5140 2962 2408 3282 3139 2220 2123 2074 2065 1773 1613 1834 1955 2278 2126 2254 2226 4274 4385 4135 4129 3672 4633 4502 4456 4422 2330 3892 4098 4294 4077 2933 2511 2979 2189 2395 2288 2506 2164 2100 4100 3988 4042 3777 4952 4363 4237 4735 4951 3821 3121 3278 2945 3021 2904 1950 4997 4906 4654 4499 2789 2279 2401 2379 2124 2310 2472 2265 4082 4278 1867 2158 2582 2868 3399 2660 2807 3664 3102 2875 2901 3336 2451 1836 2542 3781 3632 3613 4141 4699 4457 4638 4257 2219 1963 2300 1649 2003 2125 2108 2246 2489 2391 2000 3264 3459 3432 3158 4668 4440 4498 4657 3907 3897 3730 3785 3039 3221 3169 2171 2639 2914 2592 2702 2223 2545 2984 1937 3211 2694 2957 2671 1795 2464 2572 2255 2202 4215 4190 3962 3233 3353 3012 3085 2035 3651 3574 3645 3193 1825 1990 2155 2565 3150 3940 3270 2930 3820 4380 4055 3870 3755 2045 1945 3880 4060 4140 4295 3520 3425 3630 3525 4220 4165 4325 4335 1940 2740 2755 2051 2075 1985 2190 2815 2600 2720 1800 2070 3365 3735 3570 3535 3155 2965 3430 3210 3380 3070 3620 3410 3445 3205 4080 2560 2230 2515 2745 2855 2405 2830 3140 2795 2135 3245 2990 2890 3265 3360 3840 3725 3955 3830 4360 4054 3605 1925 1975 1915 2670 3530 3900 3190 3420 2200 2150 2020 2595 2700 2556 2144 1968 2120 2019 2678 2870 3003 3381 2188 2711 2434 2110 2800 2085 2335 2950 3250 1850 2145 1845 2910 2420 2500 2905 2290 2490 2635 2620 2725 2385 1755 1875 1760 2050 2215 2380 2320 2210 2350 2615 3230 3160 2900 3415 3060 3465 2605 2640 2575 2525 2735 2865 3035 1980 2025 1970 2160 2205 2245 1965 1995 3015 2585 2835 2665 2370 2790 
2295 2625] acc [12. 11.5 11. 10.5 10. 9. 8.5 8. 9.5 15. 15.5 16. 14.5 20.5 17.5 12.5 14. 13.5 18.5 19. 13. 19.5 18. 17. 23.5 16.5 21. 16.9 14.9 17.7 15.3 13.9 12.8 15.4 17.6 22.2 22.1 14.2 17.4 16.2 17.8 12.2 16.4 13.6 15.7 13.2 21.9 16.7 12.1 14.8 18.6 16.8 13.7 11.1 11.4 18.2 15.8 15.9 14.1 21.5 14.4 19.4 19.2 17.2 18.7 15.1 13.4 11.2 14.7 16.6 17.3 15.2 14.3 20.1 24.8 11.3 12.9 18.8 18.1 17.9 21.7 23.7 19.9 21.8 13.8 12.6 16.1 20.7 18.3 20.4 19.6 17.1 15.6 24.6 11.6] yr [70 71 72 73 74 75 76 77 78 79 80 81 82] origin [1 3 2]
# Rows whose horsepower is missing (recorded as the "?" placeholder)
df_master[df_master["hp"] == "?"]
| car_name | mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|---|
| 32 | ford pinto | 25.0 | 4 | 98.0 | ? | 2046 | 19.0 | 71 | 1 |
| 126 | ford maverick | 21.0 | 6 | 200.0 | ? | 2875 | 17.0 | 74 | 1 |
| 330 | renault lecar deluxe | 40.9 | 4 | 85.0 | ? | 1835 | 17.3 | 80 | 2 |
| 336 | ford mustang cobra | 23.6 | 4 | 140.0 | ? | 2905 | 14.3 | 80 | 1 |
| 354 | renault 18i | 34.5 | 4 | 100.0 | ? | 2320 | 15.8 | 81 | 2 |
| 374 | amc concord dl | 23.0 | 4 | 151.0 | ? | 3035 | 20.5 | 82 | 1 |
# --- Impute the missing "hp" values ("?") with a KNN regressor ---
# Index labels of the records whose hp is the "?" placeholder
index_mask = df_master[df_master["hp"] == "?"].index
# Imputation features: all numeric columns except the target "hp" and the
# discrete yr/origin codes, standardised with z-scores.
# NOTE(review): "mpg" is used as an imputation feature — acceptable for
# imputation, but it couples hp to the later regression target.
X_imp = df_master.drop(["car_name", "hp", "yr", "origin"], axis=1).apply(zscore)
X_imp_train = X_imp.drop(index_mask)   # rows with a known hp
X_imp_pred = X_imp.loc[index_mask, :]  # rows to be imputed (label-based, not iloc)
# Known hp values, cast explicitly to float so the regressor never relies on
# implicit string-to-number coercion (hp is stored as object/str).
y_imp_train = df_master.loc[df_master["hp"] != "?", "hp"].astype(float)
# k-NN regressor with Euclidean distance (minkowski, p=2)
classifier = KNeighborsRegressor(n_neighbors=19, metric="minkowski", p=2)
classifier.fit(X_imp_train, y_imp_train)
# Predict the missing horsepower values
y_pred = classifier.predict(X_imp_pred)
# Work on a copy so df_master stays intact
df_auto = df_master.copy()
# Write the rounded predictions back into only the missing "hp" cells
# (np.around works directly on the prediction array; no list() needed)
df_auto.loc[index_mask, "hp"] = np.around(y_pred, decimals=0)
# hp is now fully numeric; store it as an integer column
df_auto["hp"] = df_auto["hp"].astype("int64")
# Confirm: no object dtype left for hp
df_auto.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 car_name 398 non-null object 1 mpg 398 non-null float64 2 cyl 398 non-null int64 3 disp 398 non-null float64 4 hp 398 non-null int64 5 wt 398 non-null int64 6 acc 398 non-null float64 7 yr 398 non-null int64 8 origin 398 non-null int64 dtypes: float64(3), int64(5), object(1) memory usage: 28.1+ KB
# Drop the non-numeric car_name column before clustering
df_auto.drop("car_name", axis=1, inplace=True)
# Keep an unengineered copy (yr/origin intact) for the later per-cluster regressions
df_auto_lr = df_auto.copy()
# Mean-imputation variant, built from the untouched master copy
df_mean_auto = df_master.copy()
# BUG FIX: the original did `df_mean_auto[mask] = value`, which assigns the
# scalar to *every column* of the matching rows (clobbering car_name, mpg,
# etc.). Only the "hp" cells should receive the rounded mean. Also cast the
# known hp strings to float before averaging — .mean() on an object-dtype
# Series of strings is not reliable.
hp_mean = np.around(df_mean_auto.loc[df_mean_auto["hp"] != "?", "hp"].astype(float).mean())
df_mean_auto.loc[df_mean_auto["hp"] == "?", "hp"] = hp_mean
# hp is now fully numeric; store it as an integer column
df_mean_auto["hp"] = df_mean_auto["hp"].astype("int64")
# Summarise each multivalued discrete column: its levels, their count, and
# the per-level record share under a uniform distribution
for feature in ['cyl', 'yr', 'origin']:
    levels = df_auto[feature].unique()
    print("Column name: ", feature)
    print("Discrete Values: ", levels)
    print("No. of values :", len(levels))
    print("With equal distribution No. of records on average with unique value: ", df_auto.shape[0]/len(levels))
    print("Proportion of sample with similar attribute: %2.0f%%" %(100/len(levels)))
    print()
Column name: cyl Discrete Values: [8 4 6 3 5] No. of values : 5 With equal distribution No. of records on average with unique value: 79.6 Proportion of sample with similar attribute: 20% Column name: yr Discrete Values: [70 71 72 73 74 75 76 77 78 79 80 81 82] No. of values : 13 With equal distribution No. of records on average with unique value: 30.615384615384617 Proportion of sample with similar attribute: 8% Column name: origin Discrete Values: [1 3 2] No. of values : 3 With equal distribution No. of records on average with unique value: 132.66666666666666 Proportion of sample with similar attribute: 33%
# Fuse yr and origin into a single code (yr*10 + origin) so that one column
# carries both discrete attributes
df_auto["cat"] = df_auto["yr"]*10 + df_auto["origin"]
# Distinct codes and their frequency counts
key, val = np.unique(df_auto["cat"], return_counts=True)
# Encode each code by its relative frequency in the sample. Explicit
# assignment via Series.map replaces the original
# `replace(..., inplace=True)`, which is deprecated on a column and subject
# to the chained-assignment pitfall; every code appears in the mapping, so
# the result is identical.
df_auto["cat"] = df_auto["cat"].map(dict(zip(key, val/df_auto.shape[0])))
# yr and origin are now redundant
df_auto.drop(["yr", "origin"], axis=1, inplace=True)
# Summary statistics of the engineered dataset
df_auto.describe()
| mpg | cyl | disp | hp | wt | acc | cat | |
|---|---|---|---|---|---|---|---|
| count | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 |
| mean | 23.514573 | 5.454774 | 193.425879 | 104.103015 | 2970.424623 | 15.568090 | 0.038888 |
| std | 7.815984 | 1.701004 | 104.269838 | 38.339159 | 846.841774 | 2.757689 | 0.019323 |
| min | 9.000000 | 3.000000 | 68.000000 | 46.000000 | 1613.000000 | 8.000000 | 0.005025 |
| 25% | 17.500000 | 4.000000 | 104.250000 | 75.000000 | 2223.750000 | 13.825000 | 0.020101 |
| 50% | 23.000000 | 4.000000 | 148.500000 | 92.500000 | 2803.500000 | 15.500000 | 0.045226 |
| 75% | 29.000000 | 8.000000 | 262.000000 | 125.000000 | 3608.000000 | 17.175000 | 0.055276 |
| max | 46.600000 | 8.000000 | 455.000000 | 230.000000 | 5140.000000 | 24.800000 | 0.072864 |
# Same per-level statistics as above, for the engineered "cat" feature
n_levels = len(key)
print("To understand our new feature called cat (short for categorical) :-")
print("With equal distribution No. of records on average with unique value: ", df_auto.shape[0]/n_levels)
print("Proportion of sample with similar attribute: %2.5f%%" %(100/n_levels))
To understand our new feature called cat (short for categorical) :- With equal distribution No. of records on average with unique value: 10.205128205128204 Proportion of sample with similar attribute: 2.56410%
For our Hopkins test we will use the following thresholds:
Any value above 0.7 indicates a high tendency to cluster.
Any value above 0.5 and up to 0.7 indicates a low tendency to cluster.
Any value of 0.5 or below means we must reject the dataset in its present form.
# Define a function for the Hopkins test of clustering tendency
def hopkins(X):
    """Hopkins statistic for the clustering tendency of dataframe X.

    Compares nearest-neighbour distances of m synthetic uniform points
    (ujd) against those of m randomly sampled real points (wjd):
    H = sum(ujd) / (sum(ujd) + sum(wjd)). Values near 1 indicate a strong
    tendency to cluster; values near 0.5 indicate uniformly random data.
    """
    d = X.shape[1]      # number of features
    n = len(X)          # number of rows
    m = int(0.1*n)      # sample size: 10% of the data
    nbrs = NearestNeighbors(n_neighbors=1).fit(X.values)
    rand_X = sample(range(0, n, 1), m)  # indices of the sampled real points
    ujd = []
    wjd = []
    for j in range(0, m):
        # Distance from a synthetic uniform point to its nearest real point.
        # BUG FIX: the uniform point is NOT a member of X, so its nearest
        # neighbour is at [0][0]; the original took [0][1] (second nearest),
        # inflating ujd and biasing H upward.
        u_dist, _ = nbrs.kneighbors(uniform(np.amin(X,axis=0),np.amax(X,axis=0),d).reshape(1, -1), 2, return_distance=True)
        ujd.append(u_dist[0][0])
        # Distance from a sampled real point to its nearest *other* point:
        # [0][0] is the point itself (distance 0), so [0][1] is correct here.
        w_dist, _ = nbrs.kneighbors(X.iloc[rand_X[j]].values.reshape(1, -1), 2, return_distance=True)
        wjd.append(w_dist[0][1])
    H = sum(ujd) / (sum(ujd) + sum(wjd))
    if isnan(H):
        # Degenerate case (e.g. all distances zero): report and return 0
        print("Null Present")
        print(ujd, wjd)
        H = 0
    return H
# Standardise the engineered features, then measure clustering tendency
scale = StandardScaler()
scaled_auto = pd.DataFrame(scale.fit_transform(df_auto))
print("Hopkins test statistics is : ", hopkins(scaled_auto))
Hopkins test statistics is : 0.860931072649194
# Correlation heatmap of the engineered features
fig, axis = plt.subplots(figsize=(10, 10))
sns.heatmap(df_auto.corr(), annot=True, cmap='Reds', ax=axis)
plt.show()
# Pairwise scatter plots of all feature combinations
sns.pairplot(df_auto)
<seaborn.axisgrid.PairGrid at 0x185731a83a0>
# PCA on the five continuous features (standardised) to decide how many
# components to keep
pca = PCA(svd_solver='randomized', random_state=42)
pca.fit(pd.DataFrame(scale.fit_transform(df_auto.iloc[:, :5])))
# Scree plot: cumulative explained variance vs. component count
n_comp = len(pca.explained_variance_ratio_)
fig = plt.figure(figsize = (12, 8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel("Number of Components")
plt.xticks(range(n_comp), range(1, n_comp + 1))
# FIX: corrected the axis-label typo ("Explaince" -> "Explained")
plt.ylabel("Cumulative Explained Variance")
plt.title("Screeplot")
plt.grid()
plt.show()
# Report the variance captured by the first three components
print("First 3 components explain %2.5f%% of the variance of the selected features"
      %(sum(pca.explained_variance_ratio_[:3])*100))
First 3 components explain 97.38471% of the variance of the selected features
# Re-initialise PCA, keeping only the 3 components chosen from the scree plot
pca = PCA(n_components=3, svd_solver='randomized', random_state=42)
# Project the five scaled continuous features onto PC1-PC3 and append them
# as new columns, then drop the original five columns. The engineered "cat"
# frequency column is kept alongside the components.
# NOTE(review): column enlargement via .loc with a list of new labels is
# version-sensitive in pandas — confirm against the pinned version.
df_auto.loc[:, ["PC1", "PC2", "PC3"]] = pca.fit_transform(pd.DataFrame(scale.fit_transform(df_auto.iloc[:, :5])))
df_auto.drop(df_auto.columns[:5], axis=1, inplace = True)
# Pairplot of the reduced feature set
sns.pairplot(df_auto)
<seaborn.axisgrid.PairGrid at 0x185733222e0>
# Correlation heatmap of the PCA-reduced dataset
fig2, axis2 = plt.subplots(figsize=(10, 10))
sns.heatmap(df_auto.corr(), annot=True, cmap='Reds', ax=axis2)
plt.show()
# Boxplot to eyeball outliers in the reduced features
sns.boxplot(data=df_auto)
<AxesSubplot:>
# Standardise every remaining column to z-scores
df_auto = df_auto.apply(zscore)
# Index of every row with at least one outlier, i.e. any value more than
# 3 standard deviations from the mean.
# BUG FIX: the original mask ~(df_auto < 3).all(axis=1) only caught values
# at or above +3; values below -3 were silently kept. Use |z| to catch both
# tails, matching the stated intent.
outlier_mask = df_auto[(np.abs(df_auto) >= 3).any(axis=1)].index
# Re-initialise PCA with 3 components purely for 3-D visualisation;
# df_auto itself is left untouched by this projection.
pca = PCA(n_components=3, svd_solver='randomized', random_state=42)
df_auto_vis = pd.DataFrame(pca.fit_transform(df_auto), columns=["Vis1", "Vis2", "Vis3"])
# 3-D scatter of the visualisation components.
# NOTE(review): Axes3D(fig, ...) and the w_*axis attributes are deprecated
# in recent Matplotlib; this relies on the pinned version used here.
fig = plt.figure(figsize=(8, 6))
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=20, azim=60, auto_add_to_figure=False)
fig.add_axes(ax)
ax.scatter(df_auto_vis.iloc[:, 0], df_auto_vis.iloc[:, 1], df_auto_vis.iloc[:, 2], edgecolor='k')
ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
# Title mentions KMeans, but no clustering has been applied yet — these are
# the raw (unlabelled) points; cluster colours come later.
ax.set_title('3D plot of KMeans Clustering')
Text(0.5, 0.92, '3D plot of KMeans Clustering')
# Elbow method: mean distortion (average nearest-centroid distance) for
# k = 1..9
candidate_ks = range(1, 10)
mean_distortions = []
for k in candidate_ks:
    km = KMeans(n_clusters=k)
    km.fit(df_auto)
    km.predict(df_auto)
    avg_dist = sum(np.min(cdist(df_auto, km.cluster_centers_, 'euclidean'), axis=1)) / df_auto.shape[0]
    mean_distortions.append(avg_dist)
# Plot mean distortion against the number of clusters
plt.plot(candidate_ks, mean_distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
C:\Users\utath\anaconda3\envs\Udemypython\lib\site-packages\sklearn\cluster\_kmeans.py:881: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
Text(0.5, 1.0, 'Selecting k with the Elbow Method')
# Silhouette analysis: for each candidate k, print the average silhouette
# score, draw the per-cluster silhouette profile, and show the clusters in
# 3-D. Outlier rows are excluded throughout via drop(outlier_mask).
range_n_clusters = [3, 4, 5, 6, 7]
for n_clusters in range_n_clusters:
    # Silhouette subplot
    fig, ax1 = plt.subplots()
    fig.set_size_inches(12, 7)
    # The (n_clusters+1)*10 adds blank space between the silhouette
    # profiles of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(df_auto.drop(outlier_mask)) + (n_clusters + 1) * 10])
    # Fixed seed (42) so the clustering is reproducible across runs
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(df_auto.drop(outlier_mask))
    # Average silhouette over all samples: a single density/separation score
    silhouette_avg = silhouette_score(df_auto.drop(outlier_mask), cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    # Per-sample silhouette coefficients, needed for the profile plot
    sample_silhouette_values = silhouette_samples(df_auto.drop(outlier_mask), cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        # Sorted silhouette values of the samples in cluster i
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)
        # Cluster number label at the vertical middle of each profile
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Leave a 10-sample gap before the next cluster's profile
        y_lower = y_upper + 10
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # Vertical reference line at the average silhouette score
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    # Second figure: 3-D scatter of the actual clusters formed.
    # NOTE(review): Axes3D(fig, ...) / w_*axis are deprecated in recent
    # Matplotlib; relies on the pinned version used here.
    fig2 = plt.figure(figsize=(8, 7))
    ax2 = Axes3D(fig2, rect=[0, 0, .95, 1], elev=20, azim=60, auto_add_to_figure=False)
    fig2.add_axes(ax2)
    labels = clusterer.labels_
    ax2.scatter(df_auto_vis.drop(outlier_mask).iloc[:, 0], df_auto_vis.drop(outlier_mask).iloc[:, 1],
                df_auto_vis.drop(outlier_mask).iloc[:, 2],c=labels.astype(float), edgecolor='k')
    ax2.w_xaxis.set_ticklabels([])
    ax2.w_yaxis.set_ticklabels([])
    ax2.w_zaxis.set_ticklabels([])
    ax2.set_title('3D plot of KMeans Clustering')
    # NOTE(review): `centers` is computed but never plotted
    centers = clusterer.cluster_centers_
    ax1.set_title(("Silhouette analysis for KMeans clustering on sample data "
                   "with n_clusters = %d" % n_clusters),
                  fontsize=14, fontweight='bold')
    plt.show()
For n_clusters = 3 The average silhouette_score is : 0.2909176179922629 For n_clusters = 4 The average silhouette_score is : 0.3000507346184173 For n_clusters = 5 The average silhouette_score is : 0.2859484009933728 For n_clusters = 6 The average silhouette_score is : 0.2975672577055842 For n_clusters = 7 The average silhouette_score is : 0.29604348195354596
# Generate the linkage matrix for hierarchical (Ward, Euclidean) clustering
Z = linkage(df_auto, 'ward', metric='euclidean')
# (n-1) merge steps x 4 columns: cluster idx 1, cluster idx 2, distance, size
Z.shape
(397, 4)
# Full dendrogram of the hierarchical clustering
plt.figure(figsize=(25, 10))
dendrogram(Z)
plt.show()

# Truncated view: only the last 25 merged clusters
dendrogram(Z, truncate_mode='lastp', p=25)
plt.show()
# Scan cut distances and, each time the flat-cluster count drops, report
# the count, the distance at which it first appears, and the largest gap
# between consecutive count changes seen so far
largest_gap = 0
prev_change = 0
cluster_count = 100
for cut in arange(5.0, 35.0, 0.25):
    flat_clusters = fcluster(Z, cut, criterion='distance')
    k = len(np.unique(flat_clusters))
    if k < cluster_count:
        if (cut - prev_change) > largest_gap:
            largest_gap = cut - prev_change
        prev_change = cut
        best_dist = cut
        cluster_count = k
        print("No. of cluster %d, distance %2.2f & gap %2.2f " %(cluster_count, best_dist, largest_gap))
No. of cluster 19, distance 5.00 & gap 5.00 No. of cluster 8, distance 10.25 & gap 5.25 No. of cluster 4, distance 15.75 & gap 5.50 No. of cluster 3, distance 21.50 & gap 5.75 No. of cluster 2, distance 27.50 & gap 6.00 No. of cluster 1, distance 33.75 & gap 6.25
# Final KMeans model with k=4 (supported by the elbow/silhouette/dendrogram
# results), fitted with the outlier rows excluded
model=KMeans(n_clusters=4)
model.fit(df_auto.drop(outlier_mask))
# Label every row (outliers included) with its nearest cluster centre, on
# the unengineered copy used for the per-cluster regressions
df_auto_lr["cluster"] = model.predict(df_auto)
# Side-by-side bar plots of the features per cluster; "wt" is dropped
# because its much larger scale would dwarf the other bars
g = sns.catplot(data=df_auto_lr.drop("wt",axis=1),
                col="cluster",
                kind="bar")
# Re-fit KMeans and re-label the rows.
# NOTE(review): this duplicates the previous cell; because KMeans is run
# without a fixed random_state, the cluster labels here may differ from the
# ones used for the bar plots above.
model=KMeans(n_clusters=4)
model.fit(df_auto.drop(outlier_mask))
df_auto_lr["cluster"] = model.predict(df_auto)
# One linear regression per cluster, treating each cluster as its own dataset
regression_model = LinearRegression()
for clus in range(len(np.unique(df_auto_lr["cluster"]))):
    # Exclude outlier rows, then keep only the current cluster's records
    data = df_auto_lr.drop(outlier_mask)
    data = data[data["cluster"] == clus]
    # One-hot encode origin and yr; iloc[:, :-1] drops the last column,
    # which works as drop-one encoding because get_dummies appends the new
    # dummy columns at the end of the frame
    data = pd.get_dummies(data, columns=['origin']).iloc[:, :-1]
    data = pd.get_dummies(data, columns=['yr']).iloc[:, :-1]
    # Separate the target (mpg) from the predictors; the cluster id itself
    # is not a predictor
    y = data.pop("mpg")
    X = data.drop("cluster", axis=1)
    regression_model.fit(X, y)
    # Report intercept and per-feature coefficients for this cluster
    print("Cluster ", clus)
    intercept = regression_model.intercept_
    print("The intercept for our model is {}".format(intercept))
    for idx, col_name in enumerate(X.columns):
        print("The coefficient for {} is {}".format(col_name, regression_model.coef_[idx]))
    print()
Cluster 0 The intercept for our model is 39.20768792460946 The coefficient for cyl is 0.9450775836509743 The coefficient for disp is -0.007783625385544016 The coefficient for hp is 0.016035676366644897 The coefficient for wt is -0.007314829367812868 The coefficient for acc is 0.44893395995900376 The coefficient for origin_1 is 0.8163544577265083 The coefficient for origin_2 is 1.7065497963403742 The coefficient for yr_70 is -14.4576117952446 The coefficient for yr_71 is -7.213219368870247 The coefficient for yr_73 is -11.048435460796302 The coefficient for yr_74 is -6.7339446464080766 The coefficient for yr_75 is -4.855804780619515 The coefficient for yr_76 is -4.667582854543182 The coefficient for yr_77 is -4.671892452502854 The coefficient for yr_78 is -1.8166558274209563 The coefficient for yr_79 is -1.9942576303805222 The coefficient for yr_80 is 0.5655979035769553 The coefficient for yr_81 is -1.1289037600759182 Cluster 1 The intercept for our model is 58.69185666778816 The coefficient for cyl is -2.6588159435111858 The coefficient for disp is -0.006496465275101994 The coefficient for hp is -0.04800339851655819 The coefficient for wt is 7.863106876224136e-05 The coefficient for acc is -0.6159854589321525 The coefficient for origin_1 is -4.182311585410156 The coefficient for yr_70 is -1.5526486749763886 The coefficient for yr_71 is -2.218890165189857 The coefficient for yr_72 is -1.8916517435176428 The coefficient for yr_73 is -2.517938596550922 The coefficient for yr_74 is -1.3296735360895828 The coefficient for yr_75 is -1.346864547872428 The coefficient for yr_76 is -1.386716444927693 The coefficient for yr_77 is 0.30568834187579796 The coefficient for yr_78 is 1.1007309133007215 The coefficient for yr_79 is 1.8567442842747008 The coefficient for yr_80 is 4.182311585410156 Cluster 2 The intercept for our model is 43.29767920781346 The coefficient for cyl is 0.24445094020632271 The coefficient for disp is -0.03162984550436107 The coefficient for hp is 
-0.013451096318243873 The coefficient for wt is -0.0027034618194529004 The coefficient for acc is -0.20793388548840408 The coefficient for origin_1 is -1.2448532385718254 The coefficient for yr_70 is -4.940345781215566 The coefficient for yr_71 is -4.7375546043711445 The coefficient for yr_72 is -5.881303069063963 The coefficient for yr_73 is -5.569951815329602 The coefficient for yr_74 is -4.07528546776457 The coefficient for yr_75 is -4.180364406362384 The coefficient for yr_76 is -2.2625985383094864 The coefficient for yr_77 is -2.581126032368077 The coefficient for yr_78 is -2.1582150834030855 The coefficient for yr_79 is -0.588341495428679 The coefficient for yr_80 is -3.0634496331528647 The coefficient for yr_81 is -2.8792060351589637 Cluster 3 The intercept for our model is 42.680332896969816 The coefficient for cyl is -0.41355586437777386 The coefficient for disp is 0.04479501595144906 The coefficient for hp is -0.02985322228825362 The coefficient for wt is -0.007590845605799107 The coefficient for acc is 0.04403710692365966 The coefficient for origin_1 is -0.28383763676780455 The coefficient for origin_2 is 0.3970558874578619 The coefficient for yr_70 is -0.855884502582116 The coefficient for yr_71 is -1.2548937789626722 The coefficient for yr_72 is -2.4844942306756366 The coefficient for yr_73 is -3.8581007574596735 The coefficient for yr_74 is -0.8113096943261487 The coefficient for yr_75 is -0.14603035388904653 The coefficient for yr_76 is -0.6947699353999661 The coefficient for yr_77 is -0.24512529818602463 The coefficient for yr_78 is -1.74638033648807 The coefficient for yr_80 is 1.9788359703321698 The coefficient for yr_81 is 1.408040360502273
# Agglomerative (Ward linkage, euclidean) clustering into 3 clusters
hc = AgglomerativeClustering(n_clusters = 3, affinity = 'euclidean', linkage = 'ward')
# Predict cluster into a new column (fit_predict labels df_auto directly)
df_auto_lr["cluster"] = hc.fit_predict(df_auto)
# Initiate Regression Model (reused/refit once per cluster below)
regression_model = LinearRegression()
# Loop through all clusters and fit a separate linear model on each one
for clus in range(len(np.unique(df_auto_lr["cluster"]))):
    # Drop outliers
    data = df_auto_lr.drop(outlier_mask)
    # Filter the required cluster
    data = data[data["cluster"] == clus]
    # Create dummy variables and drop one dummy variable
    # NOTE: get_dummies appends the new dummy columns at the end of the
    # frame, so iloc[:, :-1] removes the last dummy level (reference level)
    data = pd.get_dummies(data, columns=['origin']).iloc[:, :-1]
    data = pd.get_dummies(data, columns=['yr']).iloc[:, :-1]
    # Separate Independent and dependent variable
    y = data.pop("mpg")
    X = data.drop("cluster", axis=1)
    regression_model.fit(X, y)
    # Display Coefficients by cluster number
    print("Cluster ", clus)
    intercept = regression_model.intercept_
    print("The intercept for our model is {}".format(intercept))
    for idx, col_name in enumerate(X.columns):
        print("The coefficient for {} is {}".format(col_name, regression_model.coef_[idx]))
    print()
Cluster 0 The intercept for our model is 49.5069465843585 The coefficient for cyl is 0.7013317817678006 The coefficient for disp is 0.0063259091829923645 The coefficient for hp is -0.05742803096804463 The coefficient for wt is -0.007825841380684165 The coefficient for acc is 0.22543343478119404 The coefficient for origin_1 is -1.1245782699899796 The coefficient for origin_2 is 0.5486832620636003 The coefficient for yr_70 is -8.723724210325994 The coefficient for yr_71 is -7.879379228235578 The coefficient for yr_72 is -9.876587311974243 The coefficient for yr_73 is -10.985469518482265 The coefficient for yr_74 is -7.946870471911074 The coefficient for yr_75 is -6.908816845265908 The coefficient for yr_76 is -7.559428099326569 The coefficient for yr_77 is -6.509650160595093 The coefficient for yr_78 is -5.227056349844411 The coefficient for yr_79 is -1.0398082679181442 The coefficient for yr_80 is -0.2311651294462274 The coefficient for yr_81 is -1.6756910829787226 Cluster 1 The intercept for our model is 30.614309673282094 The coefficient for cyl is 0.3206945118501934 The coefficient for disp is -0.005818935581672498 The coefficient for hp is -0.03672533603112909 The coefficient for wt is -0.0003256215891673451 The coefficient for acc is -0.5257583966467397 The coefficient for yr_70 is -3.169664512182851 The coefficient for yr_71 is -3.5610872053010048 The coefficient for yr_72 is -3.4813774618143642 The coefficient for yr_73 is -3.975172089951653 The coefficient for yr_74 is -2.6099509201282074 The coefficient for yr_75 is -2.1146327797813838 The coefficient for yr_76 is -2.9784855586352115 The coefficient for yr_77 is -1.3396702745388538 The coefficient for yr_78 is 0.7323067098672117 Cluster 2 The intercept for our model is 46.37406120588807 The coefficient for cyl is 0.8457960109904605 The coefficient for disp is -0.04853550283872698 The coefficient for hp is -0.03231175031732552 The coefficient for wt is -0.002614065538753767 The coefficient for acc is 
-0.319599580997723 The coefficient for yr_70 is -6.286450967052853 The coefficient for yr_71 is -5.414553726876452 The coefficient for yr_72 is -6.91787421666696 The coefficient for yr_73 is -6.516347510861006 The coefficient for yr_74 is -4.741582353392111 The coefficient for yr_75 is -4.851136570975401 The coefficient for yr_76 is -2.8262386316686308 The coefficient for yr_77 is -2.3107789044822726 The coefficient for yr_78 is -3.1188413543925906 The coefficient for yr_79 is -1.9699765386840222 The coefficient for yr_81 is -5.526434756156734
• DOMAIN: Manufacturing • CONTEXT: Company X curates and packages wine across various vineyards spread throughout the country. • DATA DESCRIPTION: The data concerns the chemical composition of the wine and its respective quality. Attribute Information:
• PROJECT OBJECTIVE: Goal is to build a synthetic data generation model using the existing data provided by the company.
import pandas as pd
# Load the company Excel workbook (not csv) into a dataframe
df_name = pd.read_excel("Part2 - Company.xlsx")
# Building a class for imputing
class Myimputer(object):
    """Simple null-value imputer.

    Categorical/object columns are filled with the most frequent value
    (mode); numeric columns are filled with the median.

    Parameters
    ----------
    null_columns : list of str, optional
        Columns to impute.  When omitted, ``fit`` discovers every column
        that contains at least one null value.
    """

    # Dtype names imputed with the mode.  "category" is the actual pandas
    # dtype name; "categorical" is kept for backward compatibility with the
    # original check.
    _CATEGORICAL_DTYPES = ("category", "categorical", "object")
    # Dtype names imputed with the median
    _NUMERIC_DTYPES = ("int32", "int64", "float32", "float64")

    def __init__(self, null_columns=None):
        # Avoid the mutable-default-argument pitfall: a shared `[]` default
        # would be mutated by fit() and leak between instances.
        self.null_columns = list(null_columns) if null_columns else []

    def _check_data_type(self, ser):
        # Return the dtype name of the given Series
        return ser.dtype.name

    def fit(self, df):
        """Determine (or validate) which columns need imputation.

        When ``null_columns`` was not supplied, every column of *df* that
        contains a null is recorded.  Otherwise the user-supplied list is
        validated: columns without any null trigger an error message.

        Returns the list of columns that ``replace`` will impute.
        """
        if not self.null_columns:
            self.null_columns = [col for col in df.columns if df[col].isnull().any()]
        else:
            # Check if null-columns values provided by user are actually null
            for col in self.null_columns:
                if df[col].notnull().all():
                    print("Error: Incorrect null_columns property added")
        return(self.null_columns)

    def _impute_column(self, df, col):
        # Fill nulls of a single column in place: mode for categorical/object
        # dtypes, median for numeric dtypes.
        data = self._check_data_type(df[col])
        idx = df[df[col].isnull()].index
        if data in self._CATEGORICAL_DTYPES:
            # Convert null to mode values
            df.loc[idx, col] = df[col][df[col].notnull()].mode()[0]
        elif data in self._NUMERIC_DTYPES:
            # Convert null to median values
            df.loc[idx, col] = df[col][df[col].notnull()].median()
        else:
            print("Could not recognise datatype %s. Please change to int, float, object or categorical" %(data))

    def replace(self, df):
        """Impute nulls in the previously fitted/selected columns of *df*.

        The dataframe is modified in place and also returned.
        """
        # Check that all recorded null columns exist in this dataframe
        if all(item in df.columns for item in self.null_columns):
            for col in self.null_columns:
                self._impute_column(df, col)
        else:
            print("Name mismatch between fit and replace")
        return(df)

    def fit_replace(self, df):
        """Convenience method: ``fit`` followed by ``replace``.

        Unlike the original duplicated implementation, this also records the
        discovered null columns on the instance when none were supplied.
        """
        self.fit(df)
        return(self.replace(df))
# Initialise object (null columns are auto-discovered during fit_replace)
imp = Myimputer()
# Impute Null values in place; the imputed dataframe is also returned
imp.fit_replace(df_name)
| A | B | C | D | Quality | |
|---|---|---|---|---|---|
| 0 | 47 | 27 | 45 | 108 | Quality A |
| 1 | 174 | 133 | 134 | 166 | Quality B |
| 2 | 159 | 163 | 135 | 131 | Quality A |
| 3 | 61 | 23 | 3 | 44 | Quality A |
| 4 | 59 | 60 | 9 | 68 | Quality A |
| ... | ... | ... | ... | ... | ... |
| 56 | 200 | 186 | 185 | 179 | Quality B |
| 57 | 137 | 182 | 165 | 199 | Quality A |
| 58 | 88 | 39 | 9 | 102 | Quality A |
| 59 | 180 | 157 | 192 | 198 | Quality A |
| 60 | 157 | 135 | 135 | 156 | Quality A |
61 rows × 5 columns
# Display basic information
df_name.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 61 entries, 0 to 60 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 A 61 non-null int64 1 B 61 non-null int64 2 C 61 non-null int64 3 D 61 non-null int64 4 Quality 61 non-null object dtypes: int64(4), object(1) memory usage: 2.5+ KB
• DOMAIN: Automobile
• CONTEXT: The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
• DATA DESCRIPTION: The data contains features extracted from the silhouette of vehicles in different angles. Four "Corgie" model vehicles were used for the experiment: a double decker bus, Cheverolet van, Saab 9000 and an Opel Manta 400 cars. This particular combination of vehicles was chosen with the expectation that the bus, van and either one of the cars would be readily distinguishable, but it would be more difficult to distinguish between the cars.
• All the features are numeric i.e. geometric features extracted from the silhouette.
• PROJECT OBJECTIVE: Apply dimensionality reduction technique – PCA and train a model using principal components instead of training the model using just the raw data.
import pandas as pd
import numpy as np
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_ind, shapiro
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load the vehicle silhouette csv file into a dataframe
df_vehicle = pd.read_csv("Part3 - vehicle.csv")
# Display column dtypes and non-null counts
df_vehicle.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 841 non-null float64 2 distance_circularity 842 non-null float64 3 radius_ratio 840 non-null float64 4 pr.axis_aspect_ratio 844 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 845 non-null float64 7 elongatedness 845 non-null float64 8 pr.axis_rectangularity 843 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 843 non-null float64 11 scaled_variance.1 844 non-null float64 12 scaled_radius_of_gyration 844 non-null float64 13 scaled_radius_of_gyration.1 842 non-null float64 14 skewness_about 840 non-null float64 15 skewness_about.1 845 non-null float64 16 skewness_about.2 845 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
# Dropping records with any Null value.  dropna() is the idiomatic
# equivalent of dropping the index of rows where isnull().any(axis=1).
df_clean = df_vehicle.dropna().reset_index(drop=True)
# Go through all columns and print the unique values with their counts
for col in df_clean.columns:
    print()
    print(col)
    print(np.unique(df_clean[col], return_counts=True))
compactness
(array([ 73, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
114, 115, 116, 117, 119], dtype=int64), array([ 1, 1, 2, 4, 5, 12, 11, 19, 17, 20, 41, 46, 28, 33, 56, 41, 38,
25, 42, 35, 27, 27, 29, 30, 17, 24, 21, 16, 18, 26, 17, 18, 18, 12,
13, 7, 4, 2, 3, 1, 3, 1, 1, 1], dtype=int64))
circularity
(array([33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45.,
46., 47., 48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58.,
59.]), array([ 2, 7, 16, 40, 42, 46, 41, 41, 35, 45, 56, 48, 55, 47, 35, 30, 28,
15, 27, 27, 30, 37, 31, 14, 12, 5, 1], dtype=int64))
distance_circularity
(array([ 40., 42., 44., 47., 49., 50., 51., 52., 53., 54., 55.,
57., 58., 59., 60., 61., 62., 63., 64., 65., 66., 68.,
69., 70., 71., 72., 73., 74., 75., 76., 77., 78., 79.,
80., 81., 82., 83., 84., 85., 86., 87., 88., 89., 90.,
91., 92., 93., 94., 95., 96., 98., 100., 101., 102., 103.,
104., 105., 106., 107., 108., 109., 110., 112.]), array([ 1, 1, 1, 1, 1, 2, 6, 2, 8, 5, 2, 5, 8, 3, 11, 1, 8,
12, 13, 6, 46, 28, 15, 41, 19, 23, 20, 16, 23, 22, 25, 22, 10, 15,
14, 15, 19, 15, 26, 9, 5, 18, 9, 7, 7, 6, 6, 3, 4, 19, 22,
22, 27, 6, 27, 20, 26, 9, 13, 17, 15, 4, 1], dtype=int64))
radius_ratio
(array([104., 105., 109., 110., 111., 112., 113., 114., 115., 116., 117.,
118., 119., 120., 121., 122., 123., 124., 125., 126., 127., 128.,
129., 130., 131., 132., 133., 134., 135., 136., 137., 138., 139.,
140., 141., 142., 143., 144., 145., 146., 147., 148., 149., 150.,
151., 152., 153., 154., 155., 156., 157., 158., 159., 160., 161.,
162., 163., 164., 165., 166., 167., 168., 169., 170., 171., 172.,
173., 174., 175., 176., 177., 178., 179., 180., 181., 182., 183.,
184., 185., 186., 187., 188., 189., 190., 191., 192., 193., 194.,
195., 196., 197., 198., 199., 200., 201., 202., 203., 204., 205.,
206., 207., 208., 209., 210., 211., 212., 213., 214., 215., 216.,
217., 218., 219., 220., 221., 222., 223., 224., 225., 226., 227.,
228., 230., 231., 232., 234., 235., 238., 246., 250., 252., 306.,
322., 333.]), array([ 1, 1, 1, 3, 4, 1, 4, 4, 4, 6, 4, 2, 5, 9, 7, 5, 10,
3, 11, 8, 6, 4, 5, 12, 6, 5, 11, 3, 5, 11, 9, 7, 12, 9,
11, 6, 6, 7, 6, 6, 9, 7, 7, 12, 8, 6, 6, 9, 8, 6, 6,
9, 9, 9, 8, 13, 6, 8, 8, 8, 5, 4, 11, 9, 8, 7, 6, 6,
4, 7, 8, 5, 7, 7, 6, 4, 10, 9, 6, 12, 6, 7, 6, 5, 11,
6, 9, 10, 9, 4, 15, 3, 12, 3, 11, 7, 10, 8, 5, 9, 6, 4,
11, 3, 8, 5, 8, 2, 4, 3, 1, 2, 6, 4, 4, 5, 3, 1, 4,
1, 2, 5, 4, 6, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1],
dtype=int64))
pr.axis_aspect_ratio
(array([ 47., 48., 49., 50., 51., 52., 53., 54., 55., 56., 57.,
58., 59., 60., 61., 62., 63., 64., 65., 66., 67., 68.,
69., 70., 71., 72., 73., 74., 75., 76., 97., 102., 103.,
105., 126., 133., 138.]), array([ 2, 4, 3, 4, 10, 14, 26, 37, 34, 54, 40, 41, 62, 45, 42, 55, 43,
66, 38, 35, 27, 33, 25, 18, 15, 10, 7, 9, 5, 1, 1, 1, 1, 1,
2, 1, 1], dtype=int64))
max.length_aspect_ratio
(array([ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 22, 25, 43, 46,
48, 49, 52, 55], dtype=int64), array([ 1, 4, 18, 49, 124, 161, 108, 91, 107, 105, 30, 3, 1,
2, 1, 1, 1, 1, 2, 2, 1], dtype=int64))
scatter_ratio
(array([112., 114., 115., 116., 117., 118., 119., 120., 121., 122., 123.,
125., 126., 127., 128., 129., 130., 131., 132., 133., 134., 135.,
136., 137., 138., 139., 140., 141., 142., 143., 144., 145., 146.,
147., 148., 149., 150., 151., 152., 153., 154., 155., 156., 157.,
158., 159., 160., 161., 162., 163., 164., 165., 166., 167., 168.,
169., 170., 171., 172., 173., 174., 175., 176., 177., 178., 179.,
180., 181., 183., 184., 185., 186., 187., 188., 189., 190., 191.,
192., 193., 194., 195., 196., 197., 198., 199., 200., 201., 202.,
203., 204., 205., 206., 207., 208., 209., 210., 211., 212., 213.,
214., 215., 216., 217., 218., 219., 220., 221., 222., 223., 224.,
225., 226., 227., 234., 236., 237., 238., 239., 240., 241., 247.,
250., 251., 252., 257., 260., 261., 262., 265.]), array([ 1, 4, 2, 3, 2, 5, 6, 3, 1, 8, 5, 3, 4, 6, 9, 2, 9,
6, 9, 11, 8, 11, 7, 9, 5, 9, 10, 2, 12, 6, 9, 8, 12, 14,
18, 25, 32, 28, 18, 17, 13, 16, 12, 20, 9, 11, 11, 12, 7, 6, 6,
2, 4, 7, 3, 7, 2, 4, 3, 2, 2, 6, 5, 9, 6, 1, 1, 4,
4, 3, 6, 6, 3, 5, 3, 2, 2, 5, 5, 1, 4, 3, 5, 6, 2,
5, 8, 3, 4, 4, 4, 5, 7, 8, 3, 4, 5, 9, 10, 10, 6, 7,
10, 11, 10, 9, 10, 10, 2, 4, 4, 2, 3, 1, 1, 1, 1, 1, 1,
2, 2, 2, 1, 1, 3, 1, 1, 1, 1], dtype=int64))
elongatedness
(array([26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51.,
52., 53., 54., 55., 56., 57., 58., 59., 61.]), array([ 8, 7, 6, 2, 48, 69, 43, 28, 19, 25, 18, 18, 18, 17, 19, 21, 27,
58, 69, 70, 53, 14, 22, 16, 24, 18, 20, 10, 10, 10, 6, 12, 3, 4,
1], dtype=int64))
pr.axis_rectangularity
(array([17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29.]), array([ 42, 127, 227, 111, 47, 45, 52, 85, 56, 8, 5, 7, 1],
dtype=int64))
max.length_rectangularity
(array([118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 186,
188], dtype=int64), array([ 2, 2, 1, 1, 3, 2, 8, 12, 10, 17, 18, 13, 13, 19, 15, 13, 24,
16, 12, 13, 15, 19, 13, 20, 19, 32, 36, 35, 22, 26, 22, 16, 13, 9,
13, 10, 7, 12, 18, 11, 18, 13, 12, 14, 18, 17, 9, 12, 9, 13, 12,
8, 8, 11, 12, 11, 7, 6, 3, 6, 5, 1, 2, 2, 1, 1],
dtype=int64))
scaled_variance
(array([130., 131., 132., 134., 135., 136., 137., 138., 139., 140., 141.,
142., 143., 144., 145., 146., 147., 148., 149., 150., 151., 152.,
153., 154., 155., 156., 157., 158., 159., 160., 161., 162., 163.,
164., 165., 166., 167., 168., 169., 170., 171., 172., 173., 174.,
175., 176., 177., 178., 179., 180., 181., 182., 183., 184., 185.,
186., 187., 188., 189., 190., 191., 192., 193., 194., 195., 196.,
197., 199., 200., 202., 203., 204., 205., 206., 207., 208., 209.,
210., 211., 212., 213., 214., 215., 216., 217., 218., 219., 220.,
221., 222., 223., 224., 225., 226., 227., 228., 229., 230., 231.,
232., 234., 235., 236., 237., 238., 240., 241., 243., 246., 254.,
258., 262., 263., 264., 265., 266., 267., 269., 272., 275., 278.,
280., 285., 287., 288., 320.]), array([ 1, 1, 1, 1, 6, 2, 5, 3, 4, 4, 5, 4, 3, 3, 4, 3, 6,
8, 3, 3, 5, 5, 3, 7, 6, 5, 5, 7, 10, 7, 7, 8, 11, 12,
10, 14, 14, 17, 28, 31, 17, 16, 24, 16, 19, 12, 11, 9, 13, 11, 11,
4, 3, 14, 4, 4, 2, 4, 13, 5, 3, 2, 2, 2, 2, 5, 10, 6,
9, 11, 2, 2, 3, 3, 8, 7, 3, 3, 2, 4, 1, 13, 4, 8, 11,
5, 13, 9, 10, 10, 14, 9, 11, 15, 6, 14, 13, 1, 10, 11, 5, 2,
1, 2, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 1, 2, 2, 1, 1, 1], dtype=int64))
scaled_variance.1
(array([ 184., 191., 192., 193., 194., 195., 196., 200., 203.,
204., 205., 206., 207., 208., 209., 211., 212., 213.,
216., 218., 219., 220., 221., 222., 223., 224., 225.,
227., 229., 230., 232., 233., 237., 238., 240., 241.,
242., 243., 245., 246., 247., 249., 250., 251., 252.,
253., 254., 255., 256., 258., 259., 260., 261., 262.,
264., 265., 266., 268., 269., 270., 271., 272., 273.,
274., 275., 277., 278., 279., 280., 281., 282., 283.,
284., 285., 286., 287., 289., 290., 291., 293., 294.,
295., 296., 297., 298., 299., 300., 301., 304., 305.,
306., 307., 308., 309., 310., 311., 312., 313., 314.,
315., 317., 318., 319., 320., 321., 322., 323., 324.,
325., 326., 327., 328., 329., 330., 331., 332., 333.,
334., 335., 336., 337., 338., 339., 340., 341., 342.,
343., 344., 345., 346., 347., 348., 349., 350., 351.,
352., 353., 354., 355., 356., 357., 358., 359., 360.,
361., 362., 363., 364., 365., 366., 367., 368., 369.,
370., 371., 372., 373., 374., 375., 376., 378., 379.,
381., 382., 383., 385., 387., 388., 389., 390., 391.,
393., 394., 395., 396., 399., 401., 402., 404., 405.,
406., 408., 409., 413., 414., 415., 416., 417., 418.,
419., 422., 425., 426., 427., 428., 429., 430., 433.,
434., 435., 438., 440., 444., 445., 446., 450., 452.,
455., 457., 458., 459., 460., 462., 463., 465., 466.,
467., 469., 471., 472., 473., 474., 476., 478., 479.,
480., 481., 484., 485., 486., 487., 489., 492., 494.,
504., 506., 508., 511., 512., 513., 517., 518., 519.,
520., 521., 523., 524., 525., 526., 527., 530., 533.,
534., 535., 536., 543., 545., 546., 552., 557., 558.,
559., 561., 562., 563., 567., 570., 571., 572., 573.,
574., 575., 576., 578., 579., 583., 584., 586., 587.,
589., 595., 596., 597., 598., 600., 601., 602., 604.,
605., 607., 608., 610., 611., 612., 613., 616., 621.,
622., 623., 624., 625., 627., 628., 629., 630., 631.,
633., 635., 636., 637., 638., 639., 640., 641., 642.,
644., 645., 648., 650., 653., 657., 658., 659., 660.,
661., 663., 664., 665., 666., 667., 668., 669., 670.,
671., 673., 674., 675., 676., 677., 678., 680., 681.,
682., 683., 684., 685., 686., 687., 688., 691., 692.,
693., 694., 696., 697., 698., 700., 701., 703., 704.,
705., 706., 707., 708., 709., 710., 711., 712., 713.,
716., 717., 718., 719., 720., 721., 722., 725., 726.,
727., 728., 729., 730., 731., 732., 735., 737., 741.,
748., 752., 756., 757., 766., 776., 816., 822., 838.,
844., 855., 857., 866., 870., 891., 892., 904., 923.,
928., 954., 956., 968., 982., 987., 998., 1018.]), array([1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 3, 1, 2, 1, 1, 1, 2, 1,
2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 3, 1, 5, 3, 3, 1, 3,
3, 4, 1, 1, 1, 2, 5, 3, 4, 3, 2, 4, 4, 3, 1, 1, 1, 1, 1, 2, 2, 2,
2, 3, 2, 4, 2, 1, 4, 1, 1, 1, 2, 4, 1, 1, 2, 3, 2, 1, 2, 2, 1, 1,
1, 3, 1, 3, 3, 3, 1, 3, 4, 1, 5, 1, 6, 4, 5, 3, 6, 7, 4, 4, 7, 6,
8, 3, 3, 7, 7, 6, 8, 6, 2, 4, 4, 3, 5, 5, 7, 5, 2, 3, 4, 2, 5, 2,
6, 3, 6, 4, 1, 7, 4, 3, 3, 3, 2, 1, 3, 5, 4, 2, 5, 4, 7, 1, 2, 2,
5, 2, 6, 2, 2, 1, 1, 3, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 3, 1,
1, 2, 1, 1, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 4, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2,
1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1,
1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 2, 1,
3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 4,
2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 3, 2, 1, 2, 1, 2,
1, 4, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1,
3, 2, 2, 1, 1, 1, 2, 3, 1, 4, 3, 1, 4, 2, 2, 3, 3, 1, 1, 3, 2, 1,
1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1], dtype=int64))
scaled_radius_of_gyration
(array([109., 112., 113., 114., 115., 116., 117., 118., 119., 120., 121.,
123., 124., 125., 126., 127., 128., 129., 130., 131., 132., 133.,
134., 135., 136., 137., 138., 139., 140., 141., 142., 143., 144.,
145., 146., 147., 148., 149., 150., 151., 152., 153., 154., 155.,
156., 157., 158., 159., 160., 161., 162., 163., 164., 165., 166.,
167., 168., 169., 170., 171., 172., 173., 174., 175., 176., 177.,
178., 179., 180., 181., 182., 183., 184., 185., 186., 187., 188.,
189., 190., 191., 192., 193., 194., 195., 196., 197., 198., 199.,
200., 201., 202., 203., 204., 205., 206., 207., 208., 209., 210.,
211., 212., 213., 214., 215., 216., 217., 218., 219., 220., 221.,
222., 223., 224., 226., 228., 229., 230., 231., 232., 234., 235.,
236., 237., 238., 239., 240., 241., 242., 243., 244., 245., 246.,
247., 249., 250., 255., 257., 260., 261., 262., 268.]), array([ 1, 3, 1, 1, 2, 2, 3, 2, 4, 2, 7, 8, 6, 5, 2, 11, 6,
4, 5, 3, 6, 5, 5, 5, 6, 9, 8, 13, 7, 6, 6, 7, 11, 12,
4, 5, 8, 7, 7, 10, 6, 8, 4, 6, 6, 12, 11, 10, 8, 8, 10,
5, 8, 8, 7, 3, 9, 5, 5, 20, 12, 13, 16, 5, 18, 13, 12, 8,
5, 2, 5, 7, 9, 15, 23, 9, 9, 7, 5, 5, 4, 4, 4, 5, 3,
5, 8, 6, 9, 9, 6, 6, 7, 4, 4, 2, 1, 3, 6, 1, 7, 7,
13, 3, 10, 7, 12, 6, 5, 5, 5, 6, 4, 2, 2, 2, 2, 2, 2,
1, 2, 1, 1, 1, 2, 3, 1, 3, 1, 1, 2, 2, 1, 1, 2, 1,
1, 1, 2, 1, 1], dtype=int64))
scaled_radius_of_gyration.1
(array([ 59., 60., 61., 62., 63., 64., 65., 66., 67., 68., 69.,
70., 71., 72., 73., 74., 75., 76., 77., 78., 79., 80.,
81., 82., 83., 84., 85., 86., 87., 88., 89., 90., 91.,
97., 99., 118., 119., 127., 135.]), array([ 1, 2, 11, 18, 23, 34, 31, 38, 52, 37, 43, 51, 68, 75, 44, 50, 37,
23, 20, 17, 11, 17, 17, 16, 13, 7, 21, 11, 10, 5, 1, 2, 1, 1,
1, 1, 1, 1, 1], dtype=int64))
skewness_about
(array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.,
13., 14., 15., 16., 17., 18., 19., 20., 21., 22.]), array([77, 76, 61, 55, 69, 66, 62, 59, 47, 45, 36, 28, 30, 25, 16, 19, 10,
10, 6, 4, 3, 5, 4], dtype=int64))
skewness_about.1
(array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.,
13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 38., 39.,
40., 41.]), array([29, 36, 34, 30, 39, 34, 35, 41, 32, 39, 27, 46, 28, 29, 35, 22, 31,
20, 16, 20, 20, 29, 19, 18, 13, 11, 9, 7, 13, 11, 8, 5, 6, 5,
1, 4, 3, 5, 1, 1, 1], dtype=int64))
skewness_about.2
(array([176., 177., 178., 179., 180., 181., 182., 183., 184., 185., 186.,
187., 188., 189., 190., 191., 192., 193., 194., 195., 196., 197.,
198., 199., 200., 201., 202., 203., 204., 206.]), array([ 3, 5, 4, 26, 37, 35, 25, 39, 32, 29, 51, 60, 60, 58, 42, 40, 44,
32, 26, 31, 24, 23, 20, 18, 14, 17, 8, 7, 2, 1], dtype=int64))
hollows_ratio
(array([181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193,
194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206,
207, 208, 209, 210, 211], dtype=int64), array([ 1, 23, 38, 35, 24, 15, 19, 15, 15, 15, 26, 21, 28, 31, 43, 50, 50,
50, 50, 37, 44, 33, 29, 25, 24, 22, 11, 15, 12, 8, 4],
dtype=int64))
class
(array(['bus', 'car', 'van'], dtype=object), array([205, 413, 195], dtype=int64))
# View the multivariate pairplot, coloured by vehicle class
sns.pairplot(data=df_clean, hue="class")
<seaborn.axisgrid.PairGrid at 0x1857bce32b0>
# Creating a function which will return required outputs after performing shapiro test
def significance_test(x, y):
    """Run Welch's t-test on two samples when both pass a normality check.

    A Shapiro-Wilk p-value >= 0.05 means we FAIL to reject normality at 95%
    confidence, i.e. the sample "clears" the check.  The original condition
    was inverted (``p < 0.05``), which ran Welch's test only on samples that
    were significantly NON-normal.

    Returns the Welch's t-test p-value, or NaN when either sample fails the
    normality check.
    """
    # If Shapiro test clears both groups (Confidence of 95%) then perform
    # Welch's test, else return NaN
    if (shapiro(x)[1] >= 0.05) and (shapiro(y)[1] >= 0.05):
        # Welch's Test (t-test with unequal variances)
        t, p = ttest_ind(x, y, equal_var = False)
    else:
        p = np.nan
    return(p)
# Creating analysis dictionary to store results
analysis = {}
# Creating a column of values 1 so that product of 2 and single columns are also considered
# (multiplying by this dummy all-ones column lets a "3-way product" degenerate
# into a 2-way product; it is inserted just before the final "class" column)
df_clean.insert(df_clean.shape[1]-1, '', 1)
# Performing Welch's t test on normal and abnormal groups for all independent variables
# Running a for loop to extract each attribute name individually
# NOTE(review): columns[:-2] excludes both the dummy '' column and "class" for
# col1; columns[:-1] excludes only "class", so '' may appear as col2/col3.
for idx1 in range(len(df_clean.columns[:-2])):
    # Assign col1 identity
    col1 = df_clean.columns[idx1]
    for idx2 in range(len(df_clean.columns[:-1])):
        # Assign col2 identity
        col2 = df_clean.columns[idx2]
        # Avoiding duplicates and cancelling off through if condition
        # (skip division by the dummy '' column -- idx2 == shape[1]-2 is its
        # position -- and self-division col/col which is identically 1)
        if not (idx2==df_clean.shape[1]-2 or idx1==idx2):
            # Creating 3 groups based on Dependent variable labels for division
            group1 = df_clean[col1][df_clean["class"]=="car"]/df_clean[col2][df_clean["class"]=="car"]
            group2 = df_clean[col1][df_clean["class"]=="bus"]/df_clean[col2][df_clean["class"]=="bus"]
            group3 = df_clean[col1][df_clean["class"]=="van"]/df_clean[col2][df_clean["class"]=="van"]
            # Storing results in analysis dictionary: [Car-vs-Bus, Van-vs-Bus, Car-vs-Van]
            analysis[col1+'/'+col2] = []
            analysis[col1+'/'+col2].append(significance_test(group1, group2))
            analysis[col1+'/'+col2].append(significance_test(group3, group2))
            analysis[col1+'/'+col2].append(significance_test(group1, group3))
        for idx3 in range(len(df_clean.columns[:-1])):
            # Assign col3 identity
            col3 = df_clean.columns[idx3]
            # Only ordered triples (idx1 <= idx2 <= idx3) to avoid duplicate products
            if idx1<=idx2 and idx2<=idx3:
                # Creating 3 groups based on Dependent variable labels for product
                group1 = df_clean[col1][df_clean["class"]=="car"]*df_clean[col2][df_clean["class"]=="car"]*df_clean[col3][df_clean["class"]=="car"]
                group2 = df_clean[col1][df_clean["class"]=="bus"]*df_clean[col2][df_clean["class"]=="bus"]*df_clean[col3][df_clean["class"]=="bus"]
                group3 = df_clean[col1][df_clean["class"]=="van"]*df_clean[col2][df_clean["class"]=="van"]*df_clean[col3][df_clean["class"]=="van"]
                # Storing results in analysis dictionary: [Car-vs-Bus, Van-vs-Bus, Car-vs-Van]
                analysis[col1+'*'+col2+'*'+col3] = []
                analysis[col1+'*'+col2+'*'+col3].append(significance_test(group1, group2))
                analysis[col1+'*'+col2+'*'+col3].append(significance_test(group3, group2))
                analysis[col1+'*'+col2+'*'+col3].append(significance_test(group1, group3))
# Tabulate the p-values: one row per feature combination, one column per class pair
df_analysis = pd.DataFrame(analysis, index=["CarvsBus", "VanvsBus", "CarvsVan"]).T
# Keep combinations significant for ALL three pairs, then take the five with the
# smallest summed p-values (strongest overall class separation)
significant = df_analysis[(df_analysis<0.05).all(axis=1)]
ls_best = significant.sum(axis=1).sort_values().head().index
# Materialise each winning combination as a new column at the front of df_clean
for feat_name in ls_best:
    if '/' in feat_name:
        # Ratio feature: "numerator/denominator"
        numerator, denominator = feat_name.split('/')
        df_clean.insert(0, feat_name, df_clean[numerator]/df_clean[denominator])
    elif '*' in feat_name:
        # Product feature: "a*b*c" (c may be the dummy '' column of ones)
        part_a, part_b, part_c = feat_name.split('*')
        df_clean.insert(0, feat_name, df_clean[part_a]*df_clean[part_b]*df_clean[part_c])
# The dummy column of ones has served its purpose
df_clean.drop('', axis=1, inplace=True)
# Summary statistics of the augmented dataset
df_clean.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| elongatedness*scaled_radius_of_gyration.1*skewness_about.2 | 813.0 | 560184.307503 | 121441.605617 | 379890.000000 | 441210.000000 | 568832.000000 | 640250.000000 | 1091340.0 |
| radius_ratio/max.length_aspect_ratio | 813.0 | 21.546997 | 6.393115 | 3.942308 | 17.909091 | 20.444444 | 23.666667 | 59.5 |
| compactness*elongatedness*scaled_radius_of_gyration.1 | 813.0 | 274185.654367 | 47658.159836 | 203252.000000 | 235840.000000 | 269568.000000 | 295856.000000 | 545670.0 |
| distance_circularity*elongatedness* | 813.0 | 3250.584256 | 278.248811 | 2440.000000 | 3060.000000 | 3240.000000 | 3430.000000 | 4095.0 |
| scatter_ratio/max.length_aspect_ratio | 813.0 | 21.610382 | 6.881016 | 2.527273 | 18.363636 | 20.500000 | 23.111111 | 62.5 |
| compactness | 813.0 | 93.656827 | 8.233751 | 73.000000 | 87.000000 | 93.000000 | 100.000000 | 119.0 |
| circularity | 813.0 | 44.803198 | 6.146659 | 33.000000 | 40.000000 | 44.000000 | 49.000000 | 59.0 |
| distance_circularity | 813.0 | 82.043050 | 15.783070 | 40.000000 | 70.000000 | 79.000000 | 98.000000 | 112.0 |
| radius_ratio | 813.0 | 169.098401 | 33.615402 | 104.000000 | 141.000000 | 167.000000 | 195.000000 | 333.0 |
| pr.axis_aspect_ratio | 813.0 | 61.774908 | 7.973000 | 47.000000 | 57.000000 | 61.000000 | 65.000000 | 138.0 |
| max.length_aspect_ratio | 813.0 | 8.599016 | 4.677174 | 2.000000 | 7.000000 | 8.000000 | 10.000000 | 55.0 |
| scatter_ratio | 813.0 | 168.563346 | 33.082186 | 112.000000 | 146.000000 | 157.000000 | 198.000000 | 265.0 |
| elongatedness | 813.0 | 40.988930 | 7.803380 | 26.000000 | 33.000000 | 43.000000 | 46.000000 | 61.0 |
| pr.axis_rectangularity | 813.0 | 20.558426 | 2.573184 | 17.000000 | 19.000000 | 20.000000 | 23.000000 | 29.0 |
| max.length_rectangularity | 813.0 | 147.891759 | 14.504648 | 118.000000 | 137.000000 | 146.000000 | 159.000000 | 188.0 |
| scaled_variance | 813.0 | 188.377614 | 31.165873 | 130.000000 | 167.000000 | 179.000000 | 217.000000 | 320.0 |
| scaled_variance.1 | 813.0 | 438.382534 | 175.270368 | 184.000000 | 318.000000 | 364.000000 | 586.000000 | 1018.0 |
| scaled_radius_of_gyration | 813.0 | 174.252153 | 32.332161 | 109.000000 | 149.000000 | 173.000000 | 198.000000 | 268.0 |
| scaled_radius_of_gyration.1 | 813.0 | 72.399754 | 7.475994 | 59.000000 | 67.000000 | 71.000000 | 75.000000 | 135.0 |
| skewness_about | 813.0 | 6.351784 | 4.921476 | 0.000000 | 2.000000 | 6.000000 | 9.000000 | 22.0 |
| skewness_about.1 | 813.0 | 12.687577 | 8.926951 | 0.000000 | 6.000000 | 11.000000 | 19.000000 | 41.0 |
| skewness_about.2 | 813.0 | 188.979090 | 6.153681 | 176.000000 | 184.000000 | 189.000000 | 193.000000 | 206.0 |
| hollows_ratio | 813.0 | 195.729397 | 7.398781 | 181.000000 | 191.000000 | 197.000000 | 201.000000 | 211.0 |
# Tukey's HSD post-hoc test across the three class groups for every feature
# (all columns except the trailing "class" target)
for feature in df_clean.columns[:-1]:
    # Blank lines around the feature name for readability
    print("\n", feature, "\n")
    # Pairwise mean comparisons at the 5% family-wise error rate
    print(pairwise_tukeyhsd(df_clean[feature], df_clean["class"], alpha=0.05))
elongatedness*scaled_radius_of_gyration.1*skewness_about.2
Multiple Comparison of Means - Tukey HSD, FWER=0.05
==============================================================
group1 group2 meandiff p-adj lower upper reject
--------------------------------------------------------------
bus car -74605.3121 0.001 -95404.729 -53805.8953 True
bus van 80159.5831 0.001 55807.0223 104512.144 True
car van 154764.8953 0.001 133612.0722 175917.7183 True
--------------------------------------------------------------
radius_ratio/max.length_aspect_ratio
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
bus car -5.4619 0.001 -6.573 -4.3508 True
bus van -9.0068 0.001 -10.3077 -7.7059 True
car van -3.5449 0.001 -4.6749 -2.4149 True
----------------------------------------------------
compactness*elongatedness*scaled_radius_of_gyration.1
Multiple Comparison of Means - Tukey HSD, FWER=0.05
=============================================================
group1 group2 meandiff p-adj lower upper reject
-------------------------------------------------------------
bus car -26223.117 0.001 -34275.2685 -18170.9656 True
bus van 37274.6946 0.001 27847.002 46702.3871 True
car van 63497.8116 0.001 55308.8448 71686.7784 True
-------------------------------------------------------------
distance_circularity*elongatedness*
Multiple Comparison of Means - Tukey HSD, FWER=0.05
=====================================================
group1 group2 meandiff p-adj lower upper reject
-----------------------------------------------------
bus car 261.3113 0.001 215.7503 306.8724 True
bus van 455.973 0.001 402.6288 509.3172 True
car van 194.6617 0.001 148.3265 240.9968 True
-----------------------------------------------------
scatter_ratio/max.length_aspect_ratio
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
bus car -6.0861 0.001 -7.2724 -4.8998 True
bus van -9.9029 0.001 -11.2919 -8.514 True
car van -3.8168 0.001 -5.0233 -2.6104 True
----------------------------------------------------
compactness
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
bus car 4.7665 0.001 3.201 6.332 True
bus van -1.0114 0.3996 -2.8443 0.8215 False
car van -5.7779 0.001 -7.37 -4.1858 True
----------------------------------------------------
circularity
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
bus car 1.1872 0.0506 -0.0023 2.3767 False
bus van -2.8876 0.001 -4.2802 -1.4949 True
car van -4.0748 0.001 -5.2845 -2.8651 True
----------------------------------------------------
distance_circularity
Multiple Comparison of Means - Tukey HSD, FWER=0.05
======================================================
group1 group2 meandiff p-adj lower upper reject
------------------------------------------------------
bus car 12.5486 0.001 9.7279 15.3692 True
bus van -3.3957 0.0422 -6.6982 -0.0933 True
car van -15.9443 0.001 -18.8128 -13.0758 True
------------------------------------------------------
radius_ratio
Multiple Comparison of Means - Tukey HSD, FWER=0.05
=====================================================
group1 group2 meandiff p-adj lower upper reject
-----------------------------------------------------
bus car 14.2476 0.001 8.0897 20.4054 True
bus van -19.6982 0.001 -26.908 -12.4884 True
car van -33.9458 0.001 -40.2082 -27.6833 True
-----------------------------------------------------
pr.axis_aspect_ratio
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
bus car -2.7782 0.001 -4.362 -1.1944 True
bus van -2.559 0.0036 -4.4133 -0.7046 True
car van 0.2192 0.9 -1.3915 1.83 False
----------------------------------------------------
max.length_aspect_ratio
Multiple Comparison of Means - Tukey HSD, FWER=0.05
==================================================
group1 group2 meandiff p-adj lower upper reject
--------------------------------------------------
bus car 1.8133 0.001 0.8945 2.7321 True
bus van 2.6838 0.001 1.608 3.7596 True
car van 0.8705 0.074 -0.0639 1.8049 False
--------------------------------------------------
scatter_ratio
Multiple Comparison of Means - Tukey HSD, FWER=0.05
=====================================================
group1 group2 meandiff p-adj lower upper reject
-----------------------------------------------------
bus car 12.1108 0.001 6.3074 17.9143 True
bus van -27.7757 0.001 -34.5706 -20.9809 True
car van -39.8866 0.001 -45.7886 -33.9845 True
-----------------------------------------------------
elongatedness
Multiple Comparison of Means - Tukey HSD, FWER=0.05
===================================================
group1 group2 meandiff p-adj lower upper reject
---------------------------------------------------
bus car -2.2052 0.001 -3.5443 -0.866 True
bus van 7.7766 0.001 6.2087 9.3446 True
car van 9.9818 0.001 8.6198 11.3437 True
---------------------------------------------------
pr.axis_rectangularity
Multiple Comparison of Means - Tukey HSD, FWER=0.05
===================================================
group1 group2 meandiff p-adj lower upper reject
---------------------------------------------------
bus car 1.057 0.001 0.6003 1.5137 True
bus van -1.924 0.001 -2.4587 -1.3892 True
car van -2.981 0.001 -3.4454 -2.5165 True
---------------------------------------------------
max.length_rectangularity
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
bus car 3.7657 0.0062 0.8862 6.6452 True
bus van -1.1662 0.6791 -4.5376 2.2051 False
car van -4.9319 0.001 -7.8603 -2.0035 True
----------------------------------------------------
scaled_variance
Multiple Comparison of Means - Tukey HSD, FWER=0.05
======================================================
group1 group2 meandiff p-adj lower upper reject
------------------------------------------------------
bus car 6.0796 0.0293 0.4852 11.674 True
bus van -28.1416 0.001 -34.6916 -21.5915 True
car van -34.2212 0.001 -39.9106 -28.5318 True
------------------------------------------------------
scaled_variance.1
Multiple Comparison of Means - Tukey HSD, FWER=0.05
========================================================
group1 group2 meandiff p-adj lower upper reject
--------------------------------------------------------
bus car 59.0864 0.001 28.0776 90.0952 True
bus van -146.2439 0.001 -182.5499 -109.9379 True
car van -205.3303 0.001 -236.8659 -173.7946 True
--------------------------------------------------------
scaled_radius_of_gyration
Multiple Comparison of Means - Tukey HSD, FWER=0.05
=====================================================
group1 group2 meandiff p-adj lower upper reject
-----------------------------------------------------
bus car 0.0897 0.9 -6.1041 6.2835 False
bus van -22.6898 0.001 -29.9417 -15.438 True
car van -22.7795 0.001 -29.0785 -16.4805 True
-----------------------------------------------------
scaled_radius_of_gyration.1
Multiple Comparison of Means - Tukey HSD, FWER=0.05
===================================================
group1 group2 meandiff p-adj lower upper reject
---------------------------------------------------
bus car -6.9192 0.001 -8.307 -5.5313 True
bus van -3.9962 0.001 -5.6212 -2.3713 True
car van 2.9229 0.001 1.5115 4.3343 True
---------------------------------------------------
skewness_about
Multiple Comparison of Means - Tukey HSD, FWER=0.05
===================================================
group1 group2 meandiff p-adj lower upper reject
---------------------------------------------------
bus car 2.4015 0.001 1.433 3.3699 True
bus van 1.6683 0.0017 0.5344 2.8022 True
car van -0.7332 0.1883 -1.7181 0.2517 False
---------------------------------------------------
skewness_about.1
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
bus car 4.8398 0.001 3.1239 6.5557 True
bus van -0.652 0.7078 -2.6611 1.357 False
car van -5.4918 0.001 -7.2369 -3.7468 True
----------------------------------------------------
skewness_about.2
Multiple Comparison of Means - Tukey HSD, FWER=0.05
===================================================
group1 group2 meandiff p-adj lower upper reject
---------------------------------------------------
bus car 1.433 0.0175 0.2027 2.6633 True
bus van 0.8233 0.3735 -0.6172 2.2637 False
car van -0.6098 0.4886 -1.861 0.6414 False
---------------------------------------------------
hollows_ratio
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
bus car 5.959 0.001 4.557 7.361 True
bus van 4.3922 0.001 2.7508 6.0337 True
car van -1.5667 0.0271 -2.9925 -0.1409 True
----------------------------------------------------
# Multivariate pairplot restricted to the five engineered features, coloured by class
sns.pairplot(data=df_clean, vars=df_clean.columns[:5], hue="class")
<seaborn.axisgrid.PairGrid at 0x1850a5a6d60>
# Boxplots of all remaining (non-engineered) features, one facet row per class
g = sns.catplot(data=df_clean.iloc[:, 5:], kind="box",
                row="class", height=5, aspect=3)
# Tilt the x-axis labels so the long feature names stay readable
g.set_xticklabels(rotation=30)
<seaborn.axisgrid.FacetGrid at 0x1850a66b0d0>
# Class-wise boxplots for the five engineered features, one axis per feature
fig, axes = plt.subplots(5, 1, figsize=(15, 25))
# Keep only the 5 engineered columns plus "class"
engineered = df_clean.drop(df_clean.columns[5:-1], axis=1)
for i in range(5):
    ax = sns.boxplot(y="class", x=df_clean.columns[i], data=engineered, ax=axes[i])
# Separate predictors (every column but the last) from the "class" target,
# label-encoding the target in the same step
X = df_clean.iloc[:, :-1]
y = df_clean["class"].replace({'car': 0, 'bus': 1, 'van': 2})
# Stratified 80/20 split so class proportions carry over to both halves
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Sanity check: the label distribution should match in train and test
print("y_train proportions")
print(np.unique(y_train, return_counts=True)[1]/len(y_train))
print("y_test proportions")
print(np.unique(y_test, return_counts=True)[1]/len(y_test))
y_train proportions [0.50769231 0.25230769 0.24 ] y_test proportions [0.50920245 0.25153374 0.2392638 ]
# Hyper-parameter grid: regularisation strength C and RBF kernel coefficient gamma
# (kernel type is not searched in this first pass)
param_grid = {"C": [0.001, 0.01, 0.1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 10, 100]}
# Balanced class weights compensate for the car-heavy label distribution
model = svm.SVC(class_weight="balanced")
# Exhaustive grid search with 3-fold cross-validation, keeping train scores
# so over-fitting is visible in the results table
grid_search_SVC = GridSearchCV(model,
                               cv=3,
                               param_grid=param_grid,
                               return_train_score=True,
                               verbose=1)
grid_search_SVC.fit(X_train, y_train)
Fitting 3 folds for each of 25 candidates, totalling 75 fits
GridSearchCV(cv=3, estimator=SVC(class_weight='balanced'),
param_grid={'C': [0.001, 0.01, 0.1, 10, 100],
'gamma': [0.001, 0.01, 0.1, 10, 100]},
return_train_score=True, verbose=1)
# Tabulate every fold's scores and sort so the best-ranked parameter sets come first
cv_results = pd.DataFrame(grid_search_SVC.cv_results_)
cv_results.sort_values("rank_test_score")
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_C | param_gamma | params | split0_test_score | split1_test_score | split2_test_score | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 24 | 0.025942 | 8.016739e-04 | 0.015615 | 4.630551e-04 | 100 | 100 | {'C': 100, 'gamma': 100} | 0.506912 | 0.506912 | 0.509259 | 0.507695 | 0.001106 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 22 | 0.025587 | 4.855319e-04 | 0.015637 | 4.780470e-04 | 100 | 0.1 | {'C': 100, 'gamma': 0.1} | 0.506912 | 0.506912 | 0.509259 | 0.507695 | 0.001106 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 21 | 0.027248 | 4.787153e-04 | 0.016301 | 4.620829e-04 | 100 | 0.01 | {'C': 100, 'gamma': 0.01} | 0.506912 | 0.506912 | 0.509259 | 0.507695 | 0.001106 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 20 | 0.025930 | 7.018853e-07 | 0.015625 | 4.701905e-04 | 100 | 0.001 | {'C': 100, 'gamma': 0.001} | 0.506912 | 0.506912 | 0.509259 | 0.507695 | 0.001106 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 19 | 0.025599 | 4.712583e-04 | 0.015282 | 4.781003e-04 | 10 | 100 | {'C': 10, 'gamma': 100} | 0.506912 | 0.506912 | 0.509259 | 0.507695 | 0.001106 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 18 | 0.025276 | 4.632243e-04 | 0.015292 | 4.708647e-04 | 10 | 10 | {'C': 10, 'gamma': 10} | 0.506912 | 0.506912 | 0.509259 | 0.507695 | 0.001106 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 17 | 0.025601 | 4.708117e-04 | 0.015279 | 4.558096e-04 | 10 | 0.1 | {'C': 10, 'gamma': 0.1} | 0.506912 | 0.506912 | 0.509259 | 0.507695 | 0.001106 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 16 | 0.026787 | 2.021402e-04 | 0.015624 | 4.698536e-04 | 10 | 0.01 | {'C': 10, 'gamma': 0.01} | 0.506912 | 0.506912 | 0.509259 | 0.507695 | 0.001106 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 15 | 0.026927 | 1.410402e-03 | 0.016290 | 9.404367e-04 | 10 | 0.001 | {'C': 10, 'gamma': 0.001} | 0.506912 | 0.506912 | 0.509259 | 0.507695 | 0.001106 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 23 | 0.024922 | 1.559087e-05 | 0.015637 | 4.774092e-04 | 100 | 10 | {'C': 100, 'gamma': 10} | 0.506912 | 0.506912 | 0.509259 | 0.507695 | 0.001106 | 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 0 | 0.027792 | 4.042894e-03 | 0.016622 | 9.399309e-04 | 0.001 | 0.001 | {'C': 0.001, 'gamma': 0.001} | 0.253456 | 0.253456 | 0.513889 | 0.340267 | 0.122769 | 11 | 0.251732 | 0.251732 | 0.760369 | 0.421278 | 0.239774 |
| 5 | 0.025253 | 4.784323e-04 | 0.017276 | 4.541185e-04 | 0.01 | 0.001 | {'C': 0.01, 'gamma': 0.001} | 0.253456 | 0.253456 | 0.513889 | 0.340267 | 0.122769 | 11 | 0.251732 | 0.251732 | 0.760369 | 0.421278 | 0.239774 |
| 9 | 0.023271 | 4.704713e-04 | 0.015625 | 4.704712e-04 | 0.01 | 100 | {'C': 0.01, 'gamma': 100} | 0.253456 | 0.253456 | 0.509259 | 0.338724 | 0.120587 | 13 | 0.251732 | 0.251732 | 0.760369 | 0.421278 | 0.239774 |
| 8 | 0.023271 | 4.705980e-04 | 0.015625 | 4.701903e-04 | 0.01 | 10 | {'C': 0.01, 'gamma': 10} | 0.253456 | 0.253456 | 0.509259 | 0.338724 | 0.120587 | 13 | 0.251732 | 0.251732 | 0.760369 | 0.421278 | 0.239774 |
| 7 | 0.025255 | 4.782069e-04 | 0.016622 | 4.691816e-04 | 0.01 | 0.1 | {'C': 0.01, 'gamma': 0.1} | 0.253456 | 0.253456 | 0.509259 | 0.338724 | 0.120587 | 13 | 0.251732 | 0.251732 | 0.760369 | 0.421278 | 0.239774 |
| 6 | 0.028258 | 1.694799e-03 | 0.019268 | 4.796634e-04 | 0.01 | 0.01 | {'C': 0.01, 'gamma': 0.01} | 0.253456 | 0.253456 | 0.509259 | 0.338724 | 0.120587 | 13 | 0.251732 | 0.251732 | 0.760369 | 0.421278 | 0.239774 |
| 4 | 0.025933 | 2.156117e-03 | 0.017630 | 9.543187e-04 | 0.001 | 100 | {'C': 0.001, 'gamma': 100} | 0.253456 | 0.253456 | 0.509259 | 0.338724 | 0.120587 | 13 | 0.251732 | 0.251732 | 0.760369 | 0.421278 | 0.239774 |
| 3 | 0.024945 | 1.426699e-03 | 0.016611 | 9.487434e-04 | 0.001 | 10 | {'C': 0.001, 'gamma': 10} | 0.253456 | 0.253456 | 0.509259 | 0.338724 | 0.120587 | 13 | 0.251732 | 0.251732 | 0.760369 | 0.421278 | 0.239774 |
| 2 | 0.029254 | 1.694905e-03 | 0.018950 | 2.973602e-07 | 0.001 | 0.1 | {'C': 0.001, 'gamma': 0.1} | 0.253456 | 0.253456 | 0.509259 | 0.338724 | 0.120587 | 13 | 0.251732 | 0.251732 | 0.760369 | 0.421278 | 0.239774 |
| 1 | 0.023603 | 4.701903e-04 | 0.016622 | 9.402119e-04 | 0.001 | 0.01 | {'C': 0.001, 'gamma': 0.01} | 0.253456 | 0.253456 | 0.509259 | 0.338724 | 0.120587 | 13 | 0.251732 | 0.251732 | 0.760369 | 0.421278 | 0.239774 |
| 10 | 0.024601 | 1.243879e-03 | 0.015957 | 0.000000e+00 | 0.1 | 0.001 | {'C': 0.1, 'gamma': 0.001} | 0.239631 | 0.239631 | 0.513889 | 0.331051 | 0.129286 | 21 | 0.240185 | 0.240185 | 0.760369 | 0.413579 | 0.245217 |
| 11 | 0.023936 | 8.146854e-04 | 0.016623 | 4.701903e-04 | 0.1 | 0.01 | {'C': 0.1, 'gamma': 0.01} | 0.239631 | 0.239631 | 0.509259 | 0.329507 | 0.127104 | 22 | 0.240185 | 0.240185 | 0.760369 | 0.413579 | 0.245217 |
| 13 | 0.023936 | 8.141987e-04 | 0.016290 | 4.701903e-04 | 0.1 | 10 | {'C': 0.1, 'gamma': 10} | 0.239631 | 0.239631 | 0.509259 | 0.329507 | 0.127104 | 22 | 0.240185 | 0.240185 | 0.760369 | 0.413579 | 0.245217 |
| 14 | 0.022938 | 4.495664e-07 | 0.015625 | 4.701903e-04 | 0.1 | 100 | {'C': 0.1, 'gamma': 100} | 0.239631 | 0.239631 | 0.509259 | 0.329507 | 0.127104 | 22 | 0.240185 | 0.240185 | 0.760369 | 0.413579 | 0.245217 |
| 12 | 0.022938 | 2.973602e-07 | 0.015292 | 4.700779e-04 | 0.1 | 0.1 | {'C': 0.1, 'gamma': 0.1} | 0.239631 | 0.239631 | 0.509259 | 0.329507 | 0.127104 | 22 | 0.240185 | 0.240185 | 0.760369 | 0.413579 | 0.245217 |
# Standardise the features, then fit PCA keeping all components so the
# cumulative explained variance can be inspected
scale = StandardScaler()
pca = PCA(svd_solver='randomized', random_state=42)
pca.fit(pd.DataFrame(scale.fit_transform(df_clean.iloc[:, :-1])))
# Scree plot: cumulative explained variance vs number of components,
# used to decide how many components to retain
fig = plt.figure(figsize=(12, 8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel("Number of Components")
plt.xticks(range(len(pca.explained_variance_ratio_)),
           range(1, len(pca.explained_variance_ratio_)+1))
plt.ylabel("Cumulative Explained Variance")
plt.title("Screeplot")
plt.grid()
plt.show()
# Quantify how much of the variance the first eight components capture
print("First 8 components explain %2.5f%% of the variance of the selected features"
      %(sum(pca.explained_variance_ratio_[:8])*100))
First 8 components explain 96.24240% of the variance of the selected features
# Reinitialising PCA with the required number of components
# (8 components retain ~96% of the variance per the scree plot)
pca = PCA(n_components=8, svd_solver='randomized', random_state=42)
# Assigning new features to the dataset and dropping compressed features
# NOTE(review): adding new columns through .loc relies on pandas column-enlargement
# semantics -- confirm this behaves without warnings on the pandas version in use
df_clean.loc[:, ["PC1", "PC2", "PC3", "PC4",
"PC5", "PC6", "PC7", "PC8"]] = pca.fit_transform(pd.DataFrame(scale.fit_transform(df_clean.iloc[:, :-1])))
# Keep only the 8 principal components and the trailing "class" column
df_clean.drop(df_clean.columns[:-9], axis=1, inplace = True)
# Displaying the new pairplot
sns.pairplot(df_clean, hue="class")
<seaborn.axisgrid.PairGrid at 0x1856f3129d0>
# Pop the "class" target off the PCA dataframe and label-encode it;
# whatever remains in df_clean is the predictor matrix
y = df_clean.pop("class").replace({'car': 0, 'bus': 1, 'van': 2})
X = df_clean
# Stratified 80/20 split on the principal components
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Sanity check: the label distribution should match in train and test
print("y_train proportions")
print(np.unique(y_train, return_counts=True)[1]/len(y_train))
print("y_test proportions")
print(np.unique(y_test, return_counts=True)[1]/len(y_test))
y_train proportions [0.50769231 0.25230769 0.24 ] y_test proportions [0.50920245 0.25153374 0.2392638 ]
# Full hyper-parameter grid: regularisation, kernel coefficient and kernel type
param_grid = {"C": [0.001, 0.01, 0.1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 10, 100],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
# Balanced class weights compensate for the car-heavy label distribution
model = svm.SVC(class_weight="balanced")
# 3-fold cross-validated grid search over all 100 candidates,
# keeping train scores so over-fitting is visible
grid_search_SVC = GridSearchCV(model,
                               cv=3,
                               param_grid=param_grid,
                               return_train_score=True,
                               verbose=1)
grid_search_SVC.fit(X_train, y_train)
Fitting 3 folds for each of 100 candidates, totalling 300 fits
GridSearchCV(cv=3, estimator=SVC(class_weight='balanced'),
param_grid={'C': [0.001, 0.01, 0.1, 10, 100],
'gamma': [0.001, 0.01, 0.1, 10, 100],
'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
return_train_score=True, verbose=1)
# Tabulate every fold's scores and sort so the best-ranked parameter sets come first
cv_results = pd.DataFrame(grid_search_SVC.cv_results_)
cv_results.sort_values("rank_test_score")
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_C | param_gamma | param_kernel | params | split0_test_score | split1_test_score | split2_test_score | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 70 | 0.006981 | 1.123916e-07 | 0.004654 | 4.704150e-04 | 10 | 0.1 | rbf | {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'} | 0.917051 | 0.921659 | 0.925926 | 0.921545 | 0.003624 | 1 | 0.990762 | 0.997691 | 0.993088 | 0.993847 | 0.002879 |
| 90 | 0.006659 | 4.762496e-04 | 0.004976 | 1.601846e-05 | 100 | 0.1 | rbf | {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'} | 0.903226 | 0.926267 | 0.930556 | 0.920016 | 0.012001 | 2 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
| 86 | 0.005984 | 2.247832e-07 | 0.003657 | 4.700217e-04 | 100 | 0.01 | rbf | {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'} | 0.898618 | 0.926267 | 0.912037 | 0.912307 | 0.011290 | 3 | 0.965358 | 0.960739 | 0.976959 | 0.967685 | 0.006823 |
| 66 | 0.005984 | 2.973602e-07 | 0.004322 | 4.702464e-04 | 10 | 0.01 | rbf | {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'} | 0.894009 | 0.930876 | 0.893519 | 0.906134 | 0.017496 | 4 | 0.933025 | 0.939954 | 0.940092 | 0.937690 | 0.003299 |
| 69 | 0.017287 | 3.671663e-03 | 0.001995 | 2.973602e-07 | 10 | 0.1 | poly | {'C': 10, 'gamma': 0.1, 'kernel': 'poly'} | 0.903226 | 0.884793 | 0.875000 | 0.887673 | 0.011702 | 5 | 0.981524 | 0.993072 | 0.983871 | 0.986156 | 0.004983 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23 | 0.014960 | 4.495664e-07 | 0.005984 | 5.619580e-07 | 0.01 | 0.001 | sigmoid | {'C': 0.01, 'gamma': 0.001, 'kernel': 'sigmoid'} | 0.253456 | 0.253456 | 0.490741 | 0.332551 | 0.111857 | 94 | 0.251732 | 0.251732 | 0.562212 | 0.355225 | 0.146362 |
| 41 | 0.011637 | 4.707009e-04 | 0.004320 | 4.715463e-04 | 0.1 | 0.001 | poly | {'C': 0.1, 'gamma': 0.001, 'kernel': 'poly'} | 0.239631 | 0.239631 | 0.513889 | 0.331051 | 0.129286 | 97 | 0.240185 | 0.240185 | 0.509217 | 0.329862 | 0.126823 |
| 54 | 0.016301 | 4.393388e-04 | 0.011647 | 4.543431e-04 | 0.1 | 10 | rbf | {'C': 0.1, 'gamma': 10, 'kernel': 'rbf'} | 0.239631 | 0.239631 | 0.509259 | 0.329507 | 0.127104 | 98 | 0.240185 | 0.240185 | 0.760369 | 0.413579 | 0.245217 |
| 58 | 0.022273 | 4.705837e-04 | 0.014960 | 8.145881e-04 | 0.1 | 100 | rbf | {'C': 0.1, 'gamma': 100, 'kernel': 'rbf'} | 0.239631 | 0.239631 | 0.509259 | 0.329507 | 0.127104 | 98 | 0.240185 | 0.240185 | 0.760369 | 0.413579 | 0.245217 |
| 43 | 0.016600 | 4.776873e-04 | 0.006317 | 4.698531e-04 | 0.1 | 0.001 | sigmoid | {'C': 0.1, 'gamma': 0.001, 'kernel': 'sigmoid'} | 0.239631 | 0.239631 | 0.490741 | 0.323334 | 0.118374 | 100 | 0.240185 | 0.240185 | 0.562212 | 0.347527 | 0.151805 |
100 rows × 19 columns
# Refit the SVC with the best hyper-parameters found by the grid search
model = svm.SVC(class_weight="balanced", random_state=42, C=10, gamma=0.1, kernel='rbf')
# Train on the full training split
model.fit(X_train, y_train)
# Score the held-out test split and cross-tabulate actual vs predicted labels
y_pred = model.predict(X_test)
df_cm = pd.crosstab(y_test, y_pred)
# Show the confusion matrix
df_cm
| col_0 | 0 | 1 | 2 |
|---|---|---|---|
| class | |||
| 0 | 78 | 1 | 4 |
| 1 | 3 | 38 | 0 |
| 2 | 2 | 2 | 35 |
# Map the encoded labels back to their class names for readable output
dict_classes = {0: "car", 1: 'bus', 2: 'van'}
# Per-class precision and recall straight from the confusion matrix:
# precision_i = diagonal / column total, recall_i = diagonal / row total
precisions = []
recalls = []
for label in range(df_cm.shape[0]):
    prec = df_cm.iloc[label, label]/sum(df_cm.iloc[:, label])
    rec = df_cm.iloc[label, label]/sum(df_cm.iloc[label, :])
    precisions.append(prec)
    recalls.append(rec)
    print(dict_classes[label])
    print("Precision: ", prec)
    print("Recall: ", rec)
    print()
# Macro-average the per-class scores, then combine into a single F1
prec_avg = sum(precisions)/3
rec_avg = sum(recalls)/3
f1_avg = 2 * (prec_avg * rec_avg)/(prec_avg+rec_avg)
print("F1 Score of the entire model", f1_avg)
car Precision: 0.9397590361445783 Recall: 0.9397590361445783 bus Precision: 0.926829268292683 Recall: 0.926829268292683 van Precision: 0.8974358974358975 Recall: 0.8974358974358975 F1 Score of the entire model 0.9213414006243862
• DOMAIN: Sports management
• CONTEXT: Company X is a sports management company for international cricket.
• DATA DESCRIPTION: The data collected belongs to batsmen from the IPL series conducted so far. Attribute Information:
• PROJECT OBJECTIVE: Goal is to build a data driven batsman ranking model for the sports management company to make business decisions.
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import random
# Read the IPL batting statistics into a dataframe
df_ipl = pd.read_csv("Part4 - batting_bowling_ipl_bat.csv")
# Summarise dtypes and non-null counts per column
df_ipl.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 180 entries, 0 to 179 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 90 non-null object 1 Runs 90 non-null float64 2 Ave 90 non-null float64 3 SR 90 non-null float64 4 Fours 90 non-null float64 5 Sixes 90 non-null float64 6 HF 90 non-null float64 dtypes: float64(6), object(1) memory usage: 10.0+ KB
# Drop the fully-blank rows (half the raw file's rows are entirely null,
# per the info() output above) and renumber the index.
# dropna(how='all') is the idiomatic equivalent of keeping rows where
# notnull().any(axis=1) holds, and already returns a fresh dataframe.
df_ipl = df_ipl.dropna(how='all').reset_index(drop=True)
# Boxplot of every numeric feature for a first look at spread and outliers
sns.boxplot(data=df_ipl.iloc[:, 1:])
<AxesSubplot:>
# Indices of all outlier rows, accumulated across features
ls_index = []
# Loop over every continuous feature (all columns except the player name)
for col in list(df_ipl.columns[1:]):
    # Univariate view: histogram and horizontal boxplot side by side
    f, axes = plt.subplots(1, 2, figsize=(17,7))
    sns.boxplot(x = col, data=df_ipl, orient='h' , ax=axes[1])
    sns.histplot(df_ipl[col], ax=axes[0])
    axes[0].set_title('Distribution plot')
    axes[1].set_title('Box plot')
    plt.show()
    # Count outliers with the 1.5*IQR fence rule
    q25, q75 = np.percentile(df_ipl[col], 25), np.percentile(df_ipl[col], 75)
    IQR = q75-q25
    threshold=IQR*1.5
    lower, upper = q25 - threshold, q75 + threshold
    outliers = [i for i in df_ipl[col] if i < lower or i > upper]
    # Store the index of every outlier (bug fix: the original recorded only
    # values above the upper fence, so low-side outliers were counted in the
    # printout but never captured for inspection)
    ls_index.extend([i for i in df_ipl[col].index
                     if df_ipl.loc[i, col] < lower or df_ipl.loc[i, col] > upper])
    print('{} Total Number of outliers in {}: {}'.format('\033[1m', col, len(outliers)))
Total Number of outliers in Runs: 1
Total Number of outliers in Ave: 3
Total Number of outliers in SR: 5
Total Number of outliers in Fours: 3
Total Number of outliers in Sixes: 1
Total Number of outliers in HF: 2
# De-duplicate the collected positions and show each outlier record once
df_ipl.iloc[list(set(ls_index))]
| Name | Runs | Ave | SR | Fours | Sixes | HF | |
|---|---|---|---|---|---|---|---|
| 0 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 |
| 1 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 |
| 5 | AM Rahane | 560.0 | 40.00 | 129.33 | 73.0 | 10.0 | 5.0 |
| 6 | KP Pietersen | 305.0 | 61.00 | 147.34 | 22.0 | 20.0 | 3.0 |
| 9 | JP Duminy | 244.0 | 81.33 | 128.42 | 13.0 | 11.0 | 2.0 |
| 19 | R Dravid | 462.0 | 28.87 | 112.13 | 63.0 | 4.0 | 2.0 |
# Bubble chart of batting average vs strike rate: colour = HF, bubble
# size = Runs, marker shape keyed to the Sixes category.
sns.set_style("dark")
plt.figure(figsize=(15, 10))
# Same 21-entry marker list as before, built by repetition
marker_shapes = ["o"] * 10 + ["v"] * 10 + ["*"]
ax = sns.scatterplot(data=df_ipl, x="Ave", y="SR", hue="HF", size="Runs",
                     sizes=(10,500), style="Sixes", palette="dark",
                     x_jitter=True, markers=marker_shapes)
# Annotate every point with the corresponding player's name
def label_point(x, y, val, ax):
    # Stitch the three series into one frame so rows stay aligned
    coords = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for _, point in coords.iterrows():
        # Nudge the text slightly to the right of the marker
        ax.text(point['x']+.9, point['y'], str(point['val']))
label_point(df_ipl.Ave, df_ipl.SR, df_ipl.Name, plt.gca())
# Rank the batsmen on three axes, then combine them into an overall rank.
# Each rank is assigned by sorting and numbering 1..n, so ties are broken
# by the secondary sort keys and, failing that, by stable sort order.
# Aggressiveness: strike rate first, then sixes, then fours
df_ipl = df_ipl.sort_values(["SR", "Sixes", "Fours"], ascending=False)
df_ipl["Aggressiveness"] = np.arange(1, len(df_ipl) + 1)
# Reliability: batting average first, then total runs, then fours
df_ipl = df_ipl.sort_values(["Ave", "Runs", "Fours"], ascending=False)
df_ipl["Reliability"] = np.arange(1, len(df_ipl) + 1)
# Milestones: half-centuries/centuries count
df_ipl = df_ipl.sort_values(["HF"], ascending=False)
df_ipl["Milestones"] = np.arange(1, len(df_ipl) + 1)
# Overall: weighted blend (aggressiveness x4, reliability x3, milestones x1),
# re-numbered 1..n after sorting by the weighted score
df_ipl["Overall"] = (df_ipl["Aggressiveness"] * 4) + (df_ipl["Reliability"] * 3) + (df_ipl["Milestones"])
df_ipl = df_ipl.sort_values(["Overall"])
df_ipl["Overall"] = np.arange(1, len(df_ipl) + 1)
# Restore the original row order
df_ipl = df_ipl.sort_index()
# Render the fully ranked table
df_ipl
| Name | Runs | Ave | SR | Fours | Sixes | HF | Aggressiveness | Reliability | Milestones | Overall | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 | 4 | 2 | 1 | 1 |
| 1 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 | 10 | 15 | 2 | 8 |
| 2 | V Sehwag | 495.0 | 33.00 | 161.23 | 57.0 | 19.0 | 5.0 | 2 | 20 | 3 | 6 |
| 3 | CL White | 479.0 | 43.54 | 149.68 | 41.0 | 20.0 | 5.0 | 8 | 5 | 5 | 2 |
| 4 | S Dhawan | 569.0 | 40.64 | 129.61 | 58.0 | 18.0 | 5.0 | 29 | 8 | 6 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 85 | Z Khan | 12.0 | 6.00 | 70.58 | 1.0 | 0.0 | 0.0 | 86 | 84 | 64 | 86 |
| 86 | WD Parnell | 19.0 | 4.75 | 70.37 | 2.0 | 0.0 | 0.0 | 87 | 87 | 56 | 87 |
| 87 | PC Valthaty | 30.0 | 5.00 | 58.82 | 4.0 | 0.0 | 0.0 | 88 | 86 | 66 | 89 |
| 88 | RP Singh | 6.0 | 3.00 | 50.00 | 0.0 | 0.0 | 0.0 | 89 | 89 | 48 | 88 |
| 89 | R Sharma | 2.0 | 0.50 | 18.18 | 0.0 | 0.0 | 0.0 | 90 | 90 | 90 | 90 |
90 rows × 11 columns
# Snarky feedback messages shown for invalid menu input
ls_fdbk = ["Please follow the rules of the game", "You are failing under pressure try again",
           "Howzzat something you do not get?", "This will get boring if you do not follow the rules of the pitch",
           "Please deliver something I can process", "This game was not made for you"]
# Menu for the user
print("Select one of the following ranking system :-")
print("\n(A) Aggressiveness\n(R) Reliability\n(M) Milestones\n(O) Overall")
# Keep asking until the first character of the answer is a valid option.
while True:
    # Bug fix: input(...)[0] crashed with IndexError when the user just
    # pressed Enter; guard against the empty string first.
    answer = input("Select the ranking system chosen based on above: ")
    r = answer[0].lower() if answer else ""
    if r in ["a", "r", "m", "o"]:
        break
    else:
        # Pick one of the six feedback messages at random
        print(ls_fdbk[int(np.round(random.random()*5))])
# Keep asking until a usable count is supplied.
while True:
    try:
        n = int(input("\nNow please select the number of top players you need "))
        if n>0 and n<=90:
            break
        elif n==0:
            print("If you were gonna choose 0 then why waste my time :-(")
            break
        else:
            # Bug fix: out-of-range values (negative or >90) previously
            # looped forever with no feedback at all
            print("Just a number between 1 and 90 please :-)")
    except ValueError:
        # Bug fix: the original bare except also swallowed
        # KeyboardInterrupt/SystemExit; only non-numeric input belongs here
        print("Just a number between 1 and 90 please :-)")
# Show the top-n names under the chosen ranking system
print("\nNames in order from rank 1 going down\n")
if r == "a":
    print(list(df_ipl.sort_values("Aggressiveness")["Name"].head(n)))
if r == "r":
    print(list(df_ipl.sort_values("Reliability")["Name"].head(n)))
if r == "m":
    print(list(df_ipl.sort_values("Milestones")["Name"].head(n)))
if r == "o":
    print(list(df_ipl.sort_values("Overall")["Name"].head(n)))
Select one of the following ranking system :- (A) Aggressiveness (R) Reliability (M) Milestones (O) Overall Select the ranking system chosen based on above: O Now please select the number of top players you need 5 Names in order from rank 1 going down ['CH Gayle', 'CL White', 'AB de Villiers', 'KP Pietersen', 'DA Warner']
# importing required Libraries
import matplotlib.image as mplib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
# Load the picture from disk, report its (height, width, channels) shape,
# and render it as a sanity check before compressing it with PCA.
img = mplib.imread('pic.jfif')
print(img.shape)
plt.imshow(img)
(266, 474, 3)
<matplotlib.image.AxesImage at 0x185175900d0>
# Flatten the colour channels into the width axis so PCA can treat each
# pixel row as a single sample. Generalized: derive the target shape from
# the image itself instead of hard-coding (266, 1422), so the cell keeps
# working for any input picture.
img_r = np.reshape(img, (img.shape[0], -1))
print(img_r.shape)
(266, 1422)
# Compress each pixel row down to its top 32 principal components
pca = PCA(n_components=32)
img_r = pca.fit_transform(img_r)
print(img_r.shape)
# Fraction of the original variance retained by the 32 components
print(np.sum(pca.explained_variance_ratio_))
(266, 32) 0.9882504934529224
###### Reversing the dimensionality reduction; however, some of the information has been lost
# Map the 32-component rows back to the flattened pixel space
image = pca.inverse_transform(img_r)
print(image.shape)
# Bug fix: the reconstruction was displayed in its flattened 2-D
# (height, width*channels) form. Fold the width axis back into
# (height, width, channels) and clamp to the valid 8-bit range so imshow
# renders it as a colour image again.
# NOTE(review): assumes the source image is 8-bit RGB — confirm for pic.jfif.
plt.imshow(np.clip(image, 0, 255).reshape(img.shape).astype(np.uint8))
(266, 1422)
<matplotlib.image.AxesImage at 0x185175ef8e0>